Spaces:
Runtime error
Web Search: Playwright, spatial parsing, markdown (#1094)
Browse files* feat: playwright, spatial parsing, markdown for web search
Co-authored-by: Aaditya Sahay <aadityasahay1@gmail.com>
* feat: choose multiple clusters if necessary (#2)
* chore: resolve linting failures
* feat: improve paring performance and error messages
* feat: combine embeddable chunks together on cpu
* feat: reduce parsed pages from 10 to 8
* feat: disable javascript in playwright by default
* feat: embedding and parsing error messages
* feat: move isURL, fix type errors, misc
* feat: misc cleanup
* feat: change serializedHtmlElement to interface
* fix: isUrl filename
* fix: add playwright dependencies to docker
* feat: add playwright browsers to docker image
* feat: enable javascript by default
* feat: remove error message from console on failed page
---------
Co-authored-by: Aaditya Sahay <aadityasahay1@gmail.com>
Co-authored-by: Aaditya Sahay <56438732+Aaditya-Sahay@users.noreply.github.com>
- .env +2 -1
- Dockerfile +6 -0
- README.md +2 -0
- package-lock.json +289 -27
- package.json +5 -0
- src/lib/components/chat/ChatMessage.svelte +3 -3
- src/lib/server/embeddingEndpoints/hfApi/embeddingHfApi.ts +6 -1
- src/lib/server/isURLLocal.ts +31 -19
- src/lib/server/preprocessMessages.ts +4 -6
- src/lib/server/sentenceSimilarity.ts +12 -21
- src/lib/server/websearch/embed/combine.ts +37 -0
- src/lib/server/websearch/embed/embed.ts +80 -0
- src/lib/server/websearch/embed/tree.ts +6 -0
- src/lib/server/websearch/markdown/fromHtml.ts +98 -0
- src/lib/server/websearch/markdown/tree.ts +63 -0
- src/lib/server/websearch/markdown/types.ts +55 -0
- src/lib/server/websearch/markdown/utils/chunk.ts +60 -0
- src/lib/server/websearch/markdown/utils/nlp.ts +11 -0
- src/lib/server/websearch/markdown/utils/stringify.ts +75 -0
- src/lib/server/websearch/parseWeb.ts +0 -41
- src/lib/server/websearch/runWebSearch.ts +69 -145
- src/lib/server/websearch/scrape/parser.ts +552 -0
- src/lib/server/websearch/scrape/playwright.ts +59 -0
- src/lib/server/websearch/scrape/scrape.ts +34 -0
- src/lib/server/websearch/scrape/types.ts +5 -0
- src/lib/server/websearch/search/endpoints.ts +27 -0
- src/lib/server/websearch/{searchSearxng.ts → search/endpoints/searxng.ts} +5 -3
- src/lib/server/websearch/search/endpoints/serpApi.ts +25 -0
- src/lib/server/websearch/search/endpoints/serpStack.ts +35 -0
- src/lib/server/websearch/search/endpoints/serper.ts +31 -0
- src/lib/server/websearch/{searchWebLocal.ts → search/endpoints/webLocal.ts} +16 -26
- src/lib/server/websearch/search/endpoints/youApi.ts +41 -0
- src/lib/server/websearch/{generateQuery.ts → search/generateQuery.ts} +1 -1
- src/lib/server/websearch/search/search.ts +77 -0
- src/lib/server/websearch/searchWeb.ts +0 -148
- src/lib/types/WebSearch.ts +16 -17
- src/lib/utils/isUrl.ts +8 -0
- src/lib/utils/timeout.ts +6 -3
|
@@ -27,6 +27,7 @@ SEARXNG_QUERY_URL=# where '<query>' will be replaced with query keywords see htt
|
|
| 27 |
|
| 28 |
WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
|
| 29 |
WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
|
|
|
|
| 30 |
|
| 31 |
# Parameters to enable open id login
|
| 32 |
OPENID_CONFIG=`{
|
|
@@ -155,4 +156,4 @@ ALLOWED_USER_EMAILS=`[]` # if it's defined, only these emails will be allowed to
|
|
| 155 |
USAGE_LIMITS=`{}`
|
| 156 |
ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
|
| 157 |
METRICS_PORT=
|
| 158 |
-
LOG_LEVEL=info
|
|
|
|
| 27 |
|
| 28 |
WEBSEARCH_ALLOWLIST=`[]` # if it's defined, allow websites from only this list.
|
| 29 |
WEBSEARCH_BLOCKLIST=`[]` # if it's defined, block websites from this list.
|
| 30 |
+
WEBSEARCH_JAVASCRIPT=true # CPU usage reduces by 60% on average by disabling javascript. Enable to improve website compatibility
|
| 31 |
|
| 32 |
# Parameters to enable open id login
|
| 33 |
OPENID_CONFIG=`{
|
|
|
|
| 156 |
USAGE_LIMITS=`{}`
|
| 157 |
ALLOW_INSECURE_COOKIES=false # recommended to keep this to false but set to true if you need to run over http without tls
|
| 158 |
METRICS_PORT=
|
| 159 |
+
LOG_LEVEL=info
|
|
@@ -83,6 +83,12 @@ COPY --chown=1000 gcp-*.json /app/
|
|
| 83 |
COPY --from=builder --chown=1000 /app/build /app/build
|
| 84 |
COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
RUN chmod +x /app/entrypoint.sh
|
| 87 |
|
| 88 |
CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
|
|
|
|
| 83 |
COPY --from=builder --chown=1000 /app/build /app/build
|
| 84 |
COPY --from=builder --chown=1000 /app/node_modules /app/node_modules
|
| 85 |
|
| 86 |
+
RUN npx playwright install
|
| 87 |
+
|
| 88 |
+
USER root
|
| 89 |
+
RUN npx playwright install-deps
|
| 90 |
+
USER user
|
| 91 |
+
|
| 92 |
RUN chmod +x /app/entrypoint.sh
|
| 93 |
|
| 94 |
CMD ["/bin/bash", "-c", "/app/entrypoint.sh"]
|
|
@@ -170,6 +170,8 @@ You can enable the web search through an API by adding `YDC_API_KEY` ([docs.you.
|
|
| 170 |
|
| 171 |
You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
|
| 172 |
|
|
|
|
|
|
|
| 173 |
### Custom models
|
| 174 |
|
| 175 |
You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
|
|
|
|
| 170 |
|
| 171 |
You can also simply enable the local google websearch by setting `USE_LOCAL_WEBSEARCH=true` in your `.env.local` or specify a SearXNG instance by adding the query URL to `SEARXNG_QUERY_URL`.
|
| 172 |
|
| 173 |
+
You can enable Javascript when parsing webpages to improve compatibility with `WEBSEARCH_JAVASCRIPT=true` at the cost of increased CPU usage. You'll want at least 4 cores when enabling.
|
| 174 |
+
|
| 175 |
### Custom models
|
| 176 |
|
| 177 |
You can customize the parameters passed to the model or even use a new model by updating the `MODELS` variable in your `.env.local`. The default one can be found in `.env` and looks like this :
|
|
@@ -8,9 +8,11 @@
|
|
| 8 |
"name": "chat-ui",
|
| 9 |
"version": "0.8.4",
|
| 10 |
"dependencies": {
|
|
|
|
| 11 |
"@huggingface/hub": "^0.5.1",
|
| 12 |
"@huggingface/inference": "^2.6.3",
|
| 13 |
"@iconify-json/bi": "^1.1.21",
|
|
|
|
| 14 |
"@resvg/resvg-js": "^2.6.0",
|
| 15 |
"@xenova/transformers": "^2.16.1",
|
| 16 |
"autoprefixer": "^10.4.14",
|
|
@@ -32,10 +34,12 @@
|
|
| 32 |
"parquetjs": "^0.11.2",
|
| 33 |
"pino": "^9.0.0",
|
| 34 |
"pino-pretty": "^11.0.0",
|
|
|
|
| 35 |
"postcss": "^8.4.31",
|
| 36 |
"saslprep": "^1.0.3",
|
| 37 |
"satori": "^0.10.11",
|
| 38 |
"satori-html": "^0.3.2",
|
|
|
|
| 39 |
"serpapi": "^1.1.1",
|
| 40 |
"sharp": "^0.33.2",
|
| 41 |
"tailwind-scrollbar": "^3.0.0",
|
|
@@ -55,6 +59,7 @@
|
|
| 55 |
"@types/jsdom": "^21.1.1",
|
| 56 |
"@types/minimist": "^1.2.5",
|
| 57 |
"@types/parquetjs": "^0.10.3",
|
|
|
|
| 58 |
"@types/uuid": "^9.0.8",
|
| 59 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
| 60 |
"@typescript-eslint/parser": "^6.x",
|
|
@@ -159,39 +164,54 @@
|
|
| 159 |
}
|
| 160 |
},
|
| 161 |
"node_modules/@anthropic-ai/vertex-sdk": {
|
| 162 |
-
"version": "0.3.
|
| 163 |
-
"resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.
|
| 164 |
-
"integrity": "sha512-
|
| 165 |
"optional": true,
|
| 166 |
"dependencies": {
|
| 167 |
-
"@anthropic-ai/sdk": "
|
| 168 |
"google-auth-library": "^9.4.2"
|
| 169 |
}
|
| 170 |
},
|
| 171 |
-
"node_modules/@
|
| 172 |
-
"version": "
|
| 173 |
-
"resolved": "https://registry.npmjs.org/@
|
| 174 |
-
"integrity": "sha512
|
| 175 |
-
"optional": true,
|
| 176 |
"dependencies": {
|
| 177 |
-
"@
|
| 178 |
-
"@
|
| 179 |
-
"
|
| 180 |
-
"
|
| 181 |
-
"
|
| 182 |
-
"
|
| 183 |
-
"
|
| 184 |
-
"
|
| 185 |
-
"web-streams-polyfill": "^3.2.1"
|
| 186 |
}
|
| 187 |
},
|
| 188 |
-
"node_modules/@
|
| 189 |
-
"version": "
|
| 190 |
-
"resolved": "https://registry.npmjs.org/
|
| 191 |
-
"integrity": "sha512-
|
| 192 |
-
"
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
}
|
| 196 |
},
|
| 197 |
"node_modules/@cspotcode/source-map-support": {
|
|
@@ -1314,6 +1334,18 @@
|
|
| 1314 |
"node": ">=8.0.0"
|
| 1315 |
}
|
| 1316 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1317 |
"node_modules/@polka/url": {
|
| 1318 |
"version": "1.0.0-next.21",
|
| 1319 |
"resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
|
|
@@ -1374,6 +1406,43 @@
|
|
| 1374 |
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
|
| 1375 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
| 1376 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1377 |
"node_modules/@resvg/resvg-js": {
|
| 1378 |
"version": "2.6.0",
|
| 1379 |
"resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
|
|
@@ -2063,6 +2132,15 @@
|
|
| 2063 |
"@types/chai": "*"
|
| 2064 |
}
|
| 2065 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2066 |
"node_modules/@types/connect": {
|
| 2067 |
"version": "3.4.38",
|
| 2068 |
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
|
@@ -2108,6 +2186,29 @@
|
|
| 2108 |
"@types/send": "*"
|
| 2109 |
}
|
| 2110 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2111 |
"node_modules/@types/http-errors": {
|
| 2112 |
"version": "2.0.4",
|
| 2113 |
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
|
|
@@ -2216,6 +2317,12 @@
|
|
| 2216 |
"integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
|
| 2217 |
"dev": true
|
| 2218 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2219 |
"node_modules/@types/semver": {
|
| 2220 |
"version": "7.5.3",
|
| 2221 |
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
|
|
@@ -3660,7 +3767,6 @@
|
|
| 3660 |
"version": "4.3.1",
|
| 3661 |
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
|
| 3662 |
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
|
| 3663 |
-
"dev": true,
|
| 3664 |
"engines": {
|
| 3665 |
"node": ">=0.10.0"
|
| 3666 |
}
|
|
@@ -3791,6 +3897,30 @@
|
|
| 3791 |
"node": ">=6.0.0"
|
| 3792 |
}
|
| 3793 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3794 |
"node_modules/domexception": {
|
| 3795 |
"version": "4.0.0",
|
| 3796 |
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
|
|
@@ -3802,6 +3932,33 @@
|
|
| 3802 |
"node": ">=12"
|
| 3803 |
}
|
| 3804 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3805 |
"node_modules/dotenv": {
|
| 3806 |
"version": "16.0.3",
|
| 3807 |
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
|
|
@@ -3940,7 +4097,6 @@
|
|
| 3940 |
"version": "4.0.0",
|
| 3941 |
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
|
| 3942 |
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
|
| 3943 |
-
"dev": true,
|
| 3944 |
"engines": {
|
| 3945 |
"node": ">=10"
|
| 3946 |
},
|
|
@@ -4924,6 +5080,24 @@
|
|
| 4924 |
"node": ">=12"
|
| 4925 |
}
|
| 4926 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4927 |
"node_modules/http-errors": {
|
| 4928 |
"version": "2.0.0",
|
| 4929 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
|
@@ -5194,6 +5368,14 @@
|
|
| 5194 |
"node": ">=8"
|
| 5195 |
}
|
| 5196 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5197 |
"node_modules/is-potential-custom-element-name": {
|
| 5198 |
"version": "1.0.1",
|
| 5199 |
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
|
@@ -6354,6 +6536,11 @@
|
|
| 6354 |
"hex-rgb": "^4.1.0"
|
| 6355 |
}
|
| 6356 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6357 |
"node_modules/parse5": {
|
| 6358 |
"version": "7.1.2",
|
| 6359 |
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
|
@@ -6645,6 +6832,47 @@
|
|
| 6645 |
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
| 6646 |
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
|
| 6647 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6648 |
"node_modules/postcss": {
|
| 6649 |
"version": "8.4.35",
|
| 6650 |
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
|
|
@@ -7431,6 +7659,19 @@
|
|
| 7431 |
"rimraf": "bin.js"
|
| 7432 |
}
|
| 7433 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7434 |
"node_modules/saslprep": {
|
| 7435 |
"version": "1.0.3",
|
| 7436 |
"resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
|
|
@@ -7481,6 +7722,14 @@
|
|
| 7481 |
"node": ">=v12.22.7"
|
| 7482 |
}
|
| 7483 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7484 |
"node_modules/secure-json-parse": {
|
| 7485 |
"version": "2.7.0",
|
| 7486 |
"resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
|
|
@@ -8428,6 +8677,19 @@
|
|
| 8428 |
"node": ">=14.0.0"
|
| 8429 |
}
|
| 8430 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8431 |
"node_modules/to-regex-range": {
|
| 8432 |
"version": "5.0.1",
|
| 8433 |
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
|
|
|
| 8 |
"name": "chat-ui",
|
| 9 |
"version": "0.8.4",
|
| 10 |
"dependencies": {
|
| 11 |
+
"@cliqz/adblocker-playwright": "^1.27.2",
|
| 12 |
"@huggingface/hub": "^0.5.1",
|
| 13 |
"@huggingface/inference": "^2.6.3",
|
| 14 |
"@iconify-json/bi": "^1.1.21",
|
| 15 |
+
"@playwright/browser-chromium": "^1.43.1",
|
| 16 |
"@resvg/resvg-js": "^2.6.0",
|
| 17 |
"@xenova/transformers": "^2.16.1",
|
| 18 |
"autoprefixer": "^10.4.14",
|
|
|
|
| 34 |
"parquetjs": "^0.11.2",
|
| 35 |
"pino": "^9.0.0",
|
| 36 |
"pino-pretty": "^11.0.0",
|
| 37 |
+
"playwright": "^1.40.0",
|
| 38 |
"postcss": "^8.4.31",
|
| 39 |
"saslprep": "^1.0.3",
|
| 40 |
"satori": "^0.10.11",
|
| 41 |
"satori-html": "^0.3.2",
|
| 42 |
+
"sbd": "^1.0.19",
|
| 43 |
"serpapi": "^1.1.1",
|
| 44 |
"sharp": "^0.33.2",
|
| 45 |
"tailwind-scrollbar": "^3.0.0",
|
|
|
|
| 59 |
"@types/jsdom": "^21.1.1",
|
| 60 |
"@types/minimist": "^1.2.5",
|
| 61 |
"@types/parquetjs": "^0.10.3",
|
| 62 |
+
"@types/sbd": "^1.0.5",
|
| 63 |
"@types/uuid": "^9.0.8",
|
| 64 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
| 65 |
"@typescript-eslint/parser": "^6.x",
|
|
|
|
| 164 |
}
|
| 165 |
},
|
| 166 |
"node_modules/@anthropic-ai/vertex-sdk": {
|
| 167 |
+
"version": "0.3.6",
|
| 168 |
+
"resolved": "https://registry.npmjs.org/@anthropic-ai/vertex-sdk/-/vertex-sdk-0.3.6.tgz",
|
| 169 |
+
"integrity": "sha512-4pNVobcCsPCWLSaFJkT/XxwX5rmot+q2PE2LF5vfuRNFTWFjeTrsPgTB48D0Sce/c/2p4fddrFKGN6fdnn8zRg==",
|
| 170 |
"optional": true,
|
| 171 |
"dependencies": {
|
| 172 |
+
"@anthropic-ai/sdk": ">=0.14 <1",
|
| 173 |
"google-auth-library": "^9.4.2"
|
| 174 |
}
|
| 175 |
},
|
| 176 |
+
"node_modules/@cliqz/adblocker": {
|
| 177 |
+
"version": "1.27.2",
|
| 178 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker/-/adblocker-1.27.2.tgz",
|
| 179 |
+
"integrity": "sha512-sFjbx9xBGWaOsvVFVHVUNOrzCafGtjYDAp95KTeoJcNZbPs4D2RsabYZEeg4JkwPkfhcFseJqfnsMyJ4XsqVfQ==",
|
|
|
|
| 180 |
"dependencies": {
|
| 181 |
+
"@cliqz/adblocker-content": "^1.27.2",
|
| 182 |
+
"@cliqz/adblocker-extended-selectors": "^1.27.2",
|
| 183 |
+
"@remusao/guess-url-type": "^1.2.1",
|
| 184 |
+
"@remusao/small": "^1.2.1",
|
| 185 |
+
"@remusao/smaz": "^1.9.1",
|
| 186 |
+
"@types/chrome": "^0.0.266",
|
| 187 |
+
"@types/firefox-webext-browser": "^120.0.0",
|
| 188 |
+
"tldts-experimental": "^6.0.14"
|
|
|
|
| 189 |
}
|
| 190 |
},
|
| 191 |
+
"node_modules/@cliqz/adblocker-content": {
|
| 192 |
+
"version": "1.27.2",
|
| 193 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-content/-/adblocker-content-1.27.2.tgz",
|
| 194 |
+
"integrity": "sha512-fzxsOt7r3YUgxoyW9GPCOShKOLNbB4n3gWtyMBFQ+lwHsQKfLehxN4Zxjg4Ad6AXJNW4gfIBq69ghnj2jHfviw==",
|
| 195 |
+
"dependencies": {
|
| 196 |
+
"@cliqz/adblocker-extended-selectors": "^1.27.2"
|
| 197 |
+
}
|
| 198 |
+
},
|
| 199 |
+
"node_modules/@cliqz/adblocker-extended-selectors": {
|
| 200 |
+
"version": "1.27.2",
|
| 201 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-extended-selectors/-/adblocker-extended-selectors-1.27.2.tgz",
|
| 202 |
+
"integrity": "sha512-HZ03U8pAOuEwTo1vZ9tv49kIC4riWqYvr5p3illZshxo+eCUi8CPbgYSyYCtgd1JpO1wNnCwEX95/twXfT8cnA=="
|
| 203 |
+
},
|
| 204 |
+
"node_modules/@cliqz/adblocker-playwright": {
|
| 205 |
+
"version": "1.27.2",
|
| 206 |
+
"resolved": "https://registry.npmjs.org/@cliqz/adblocker-playwright/-/adblocker-playwright-1.27.2.tgz",
|
| 207 |
+
"integrity": "sha512-b+OoWKz/h787YItfCwjnhZ8l6/bv/DPTzaq1pyyY6Ovpdd+dGvVW1fehw+87FC6j/WQbTeuOdpLiwp8ouvrftg==",
|
| 208 |
+
"dependencies": {
|
| 209 |
+
"@cliqz/adblocker": "^1.27.2",
|
| 210 |
+
"@cliqz/adblocker-content": "^1.27.2",
|
| 211 |
+
"tldts-experimental": "^6.0.14"
|
| 212 |
+
},
|
| 213 |
+
"peerDependencies": {
|
| 214 |
+
"playwright": "^1.x"
|
| 215 |
}
|
| 216 |
},
|
| 217 |
"node_modules/@cspotcode/source-map-support": {
|
|
|
|
| 1334 |
"node": ">=8.0.0"
|
| 1335 |
}
|
| 1336 |
},
|
| 1337 |
+
"node_modules/@playwright/browser-chromium": {
|
| 1338 |
+
"version": "1.43.1",
|
| 1339 |
+
"resolved": "https://registry.npmjs.org/@playwright/browser-chromium/-/browser-chromium-1.43.1.tgz",
|
| 1340 |
+
"integrity": "sha512-CBuHhRIF/VGyUnPvK7/4IUbm0AAOZZI5huHlr+qNr5cFQpQ6TXBqOwSMef/xUz9HcjxWOxDPION7br1kOlyV/A==",
|
| 1341 |
+
"hasInstallScript": true,
|
| 1342 |
+
"dependencies": {
|
| 1343 |
+
"playwright-core": "1.43.1"
|
| 1344 |
+
},
|
| 1345 |
+
"engines": {
|
| 1346 |
+
"node": ">=16"
|
| 1347 |
+
}
|
| 1348 |
+
},
|
| 1349 |
"node_modules/@polka/url": {
|
| 1350 |
"version": "1.0.0-next.21",
|
| 1351 |
"resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
|
|
|
|
| 1406 |
"resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
|
| 1407 |
"integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw=="
|
| 1408 |
},
|
| 1409 |
+
"node_modules/@remusao/guess-url-type": {
|
| 1410 |
+
"version": "1.2.1",
|
| 1411 |
+
"resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.2.1.tgz",
|
| 1412 |
+
"integrity": "sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA=="
|
| 1413 |
+
},
|
| 1414 |
+
"node_modules/@remusao/small": {
|
| 1415 |
+
"version": "1.2.1",
|
| 1416 |
+
"resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.2.1.tgz",
|
| 1417 |
+
"integrity": "sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw=="
|
| 1418 |
+
},
|
| 1419 |
+
"node_modules/@remusao/smaz": {
|
| 1420 |
+
"version": "1.9.1",
|
| 1421 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.9.1.tgz",
|
| 1422 |
+
"integrity": "sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA==",
|
| 1423 |
+
"dependencies": {
|
| 1424 |
+
"@remusao/smaz-compress": "^1.9.1",
|
| 1425 |
+
"@remusao/smaz-decompress": "^1.9.1"
|
| 1426 |
+
}
|
| 1427 |
+
},
|
| 1428 |
+
"node_modules/@remusao/smaz-compress": {
|
| 1429 |
+
"version": "1.9.1",
|
| 1430 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.9.1.tgz",
|
| 1431 |
+
"integrity": "sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw==",
|
| 1432 |
+
"dependencies": {
|
| 1433 |
+
"@remusao/trie": "^1.4.1"
|
| 1434 |
+
}
|
| 1435 |
+
},
|
| 1436 |
+
"node_modules/@remusao/smaz-decompress": {
|
| 1437 |
+
"version": "1.9.1",
|
| 1438 |
+
"resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.9.1.tgz",
|
| 1439 |
+
"integrity": "sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A=="
|
| 1440 |
+
},
|
| 1441 |
+
"node_modules/@remusao/trie": {
|
| 1442 |
+
"version": "1.4.1",
|
| 1443 |
+
"resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.4.1.tgz",
|
| 1444 |
+
"integrity": "sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q=="
|
| 1445 |
+
},
|
| 1446 |
"node_modules/@resvg/resvg-js": {
|
| 1447 |
"version": "2.6.0",
|
| 1448 |
"resolved": "https://registry.npmjs.org/@resvg/resvg-js/-/resvg-js-2.6.0.tgz",
|
|
|
|
| 2132 |
"@types/chai": "*"
|
| 2133 |
}
|
| 2134 |
},
|
| 2135 |
+
"node_modules/@types/chrome": {
|
| 2136 |
+
"version": "0.0.266",
|
| 2137 |
+
"resolved": "https://registry.npmjs.org/@types/chrome/-/chrome-0.0.266.tgz",
|
| 2138 |
+
"integrity": "sha512-QSQWJTL7NjZElvq/6/E5C1+pHgEP8UAJzwoz7M4vSJ7AECt6NNehJ+tU6snnvuTqZOBjFCivvitYo5+8tNPmhg==",
|
| 2139 |
+
"dependencies": {
|
| 2140 |
+
"@types/filesystem": "*",
|
| 2141 |
+
"@types/har-format": "*"
|
| 2142 |
+
}
|
| 2143 |
+
},
|
| 2144 |
"node_modules/@types/connect": {
|
| 2145 |
"version": "3.4.38",
|
| 2146 |
"resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz",
|
|
|
|
| 2186 |
"@types/send": "*"
|
| 2187 |
}
|
| 2188 |
},
|
| 2189 |
+
"node_modules/@types/filesystem": {
|
| 2190 |
+
"version": "0.0.36",
|
| 2191 |
+
"resolved": "https://registry.npmjs.org/@types/filesystem/-/filesystem-0.0.36.tgz",
|
| 2192 |
+
"integrity": "sha512-vPDXOZuannb9FZdxgHnqSwAG/jvdGM8Wq+6N4D/d80z+D4HWH+bItqsZaVRQykAn6WEVeEkLm2oQigyHtgb0RA==",
|
| 2193 |
+
"dependencies": {
|
| 2194 |
+
"@types/filewriter": "*"
|
| 2195 |
+
}
|
| 2196 |
+
},
|
| 2197 |
+
"node_modules/@types/filewriter": {
|
| 2198 |
+
"version": "0.0.33",
|
| 2199 |
+
"resolved": "https://registry.npmjs.org/@types/filewriter/-/filewriter-0.0.33.tgz",
|
| 2200 |
+
"integrity": "sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g=="
|
| 2201 |
+
},
|
| 2202 |
+
"node_modules/@types/firefox-webext-browser": {
|
| 2203 |
+
"version": "120.0.3",
|
| 2204 |
+
"resolved": "https://registry.npmjs.org/@types/firefox-webext-browser/-/firefox-webext-browser-120.0.3.tgz",
|
| 2205 |
+
"integrity": "sha512-APbBSxOvFMbKwXy/4YrEVa5Di6N0C9yl4w0WA0xzdkOrChAfPQ/KlcC8QLyhemHCHpF1CB/zHy52+oUQurViOg=="
|
| 2206 |
+
},
|
| 2207 |
+
"node_modules/@types/har-format": {
|
| 2208 |
+
"version": "1.2.15",
|
| 2209 |
+
"resolved": "https://registry.npmjs.org/@types/har-format/-/har-format-1.2.15.tgz",
|
| 2210 |
+
"integrity": "sha512-RpQH4rXLuvTXKR0zqHq3go0RVXYv/YVqv4TnPH95VbwUxZdQlK1EtcMvQvMpDngHbt13Csh9Z4qT9AbkiQH5BA=="
|
| 2211 |
+
},
|
| 2212 |
"node_modules/@types/http-errors": {
|
| 2213 |
"version": "2.0.4",
|
| 2214 |
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
|
|
|
|
| 2317 |
"integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==",
|
| 2318 |
"dev": true
|
| 2319 |
},
|
| 2320 |
+
"node_modules/@types/sbd": {
|
| 2321 |
+
"version": "1.0.5",
|
| 2322 |
+
"resolved": "https://registry.npmjs.org/@types/sbd/-/sbd-1.0.5.tgz",
|
| 2323 |
+
"integrity": "sha512-60PxBBWhg0C3yb5bTP+wwWYGTKMcuB0S6mTEa1sedMC79tYY0Ei7YjU4qsWzGn++lWscLQde16SnElJrf5/aTw==",
|
| 2324 |
+
"dev": true
|
| 2325 |
+
},
|
| 2326 |
"node_modules/@types/semver": {
|
| 2327 |
"version": "7.5.3",
|
| 2328 |
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.3.tgz",
|
|
|
|
| 3767 |
"version": "4.3.1",
|
| 3768 |
"resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz",
|
| 3769 |
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
|
|
|
|
| 3770 |
"engines": {
|
| 3771 |
"node": ">=0.10.0"
|
| 3772 |
}
|
|
|
|
| 3897 |
"node": ">=6.0.0"
|
| 3898 |
}
|
| 3899 |
},
|
| 3900 |
+
"node_modules/dom-serializer": {
|
| 3901 |
+
"version": "2.0.0",
|
| 3902 |
+
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
|
| 3903 |
+
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
|
| 3904 |
+
"dependencies": {
|
| 3905 |
+
"domelementtype": "^2.3.0",
|
| 3906 |
+
"domhandler": "^5.0.2",
|
| 3907 |
+
"entities": "^4.2.0"
|
| 3908 |
+
},
|
| 3909 |
+
"funding": {
|
| 3910 |
+
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
|
| 3911 |
+
}
|
| 3912 |
+
},
|
| 3913 |
+
"node_modules/domelementtype": {
|
| 3914 |
+
"version": "2.3.0",
|
| 3915 |
+
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
|
| 3916 |
+
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
|
| 3917 |
+
"funding": [
|
| 3918 |
+
{
|
| 3919 |
+
"type": "github",
|
| 3920 |
+
"url": "https://github.com/sponsors/fb55"
|
| 3921 |
+
}
|
| 3922 |
+
]
|
| 3923 |
+
},
|
| 3924 |
"node_modules/domexception": {
|
| 3925 |
"version": "4.0.0",
|
| 3926 |
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
|
|
|
|
| 3932 |
"node": ">=12"
|
| 3933 |
}
|
| 3934 |
},
|
| 3935 |
+
"node_modules/domhandler": {
|
| 3936 |
+
"version": "5.0.3",
|
| 3937 |
+
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
|
| 3938 |
+
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
|
| 3939 |
+
"dependencies": {
|
| 3940 |
+
"domelementtype": "^2.3.0"
|
| 3941 |
+
},
|
| 3942 |
+
"engines": {
|
| 3943 |
+
"node": ">= 4"
|
| 3944 |
+
},
|
| 3945 |
+
"funding": {
|
| 3946 |
+
"url": "https://github.com/fb55/domhandler?sponsor=1"
|
| 3947 |
+
}
|
| 3948 |
+
},
|
| 3949 |
+
"node_modules/domutils": {
|
| 3950 |
+
"version": "3.1.0",
|
| 3951 |
+
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
|
| 3952 |
+
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
|
| 3953 |
+
"dependencies": {
|
| 3954 |
+
"dom-serializer": "^2.0.0",
|
| 3955 |
+
"domelementtype": "^2.3.0",
|
| 3956 |
+
"domhandler": "^5.0.3"
|
| 3957 |
+
},
|
| 3958 |
+
"funding": {
|
| 3959 |
+
"url": "https://github.com/fb55/domutils?sponsor=1"
|
| 3960 |
+
}
|
| 3961 |
+
},
|
| 3962 |
"node_modules/dotenv": {
|
| 3963 |
"version": "16.0.3",
|
| 3964 |
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
|
|
|
|
| 4097 |
"version": "4.0.0",
|
| 4098 |
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
|
| 4099 |
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
|
|
|
|
| 4100 |
"engines": {
|
| 4101 |
"node": ">=10"
|
| 4102 |
},
|
|
|
|
| 5080 |
"node": ">=12"
|
| 5081 |
}
|
| 5082 |
},
|
| 5083 |
+
"node_modules/htmlparser2": {
|
| 5084 |
+
"version": "8.0.2",
|
| 5085 |
+
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
|
| 5086 |
+
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
|
| 5087 |
+
"funding": [
|
| 5088 |
+
"https://github.com/fb55/htmlparser2?sponsor=1",
|
| 5089 |
+
{
|
| 5090 |
+
"type": "github",
|
| 5091 |
+
"url": "https://github.com/sponsors/fb55"
|
| 5092 |
+
}
|
| 5093 |
+
],
|
| 5094 |
+
"dependencies": {
|
| 5095 |
+
"domelementtype": "^2.3.0",
|
| 5096 |
+
"domhandler": "^5.0.3",
|
| 5097 |
+
"domutils": "^3.0.1",
|
| 5098 |
+
"entities": "^4.4.0"
|
| 5099 |
+
}
|
| 5100 |
+
},
|
| 5101 |
"node_modules/http-errors": {
|
| 5102 |
"version": "2.0.0",
|
| 5103 |
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
|
|
|
|
| 5368 |
"node": ">=8"
|
| 5369 |
}
|
| 5370 |
},
|
| 5371 |
+
"node_modules/is-plain-object": {
|
| 5372 |
+
"version": "5.0.0",
|
| 5373 |
+
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz",
|
| 5374 |
+
"integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==",
|
| 5375 |
+
"engines": {
|
| 5376 |
+
"node": ">=0.10.0"
|
| 5377 |
+
}
|
| 5378 |
+
},
|
| 5379 |
"node_modules/is-potential-custom-element-name": {
|
| 5380 |
"version": "1.0.1",
|
| 5381 |
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
|
|
|
| 6536 |
"hex-rgb": "^4.1.0"
|
| 6537 |
}
|
| 6538 |
},
|
| 6539 |
+
"node_modules/parse-srcset": {
|
| 6540 |
+
"version": "1.0.2",
|
| 6541 |
+
"resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz",
|
| 6542 |
+
"integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="
|
| 6543 |
+
},
|
| 6544 |
"node_modules/parse5": {
|
| 6545 |
"version": "7.1.2",
|
| 6546 |
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.1.2.tgz",
|
|
|
|
| 6832 |
"resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
|
| 6833 |
"integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg=="
|
| 6834 |
},
|
| 6835 |
+
"node_modules/playwright": {
|
| 6836 |
+
"version": "1.43.1",
|
| 6837 |
+
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.43.1.tgz",
|
| 6838 |
+
"integrity": "sha512-V7SoH0ai2kNt1Md9E3Gwas5B9m8KR2GVvwZnAI6Pg0m3sh7UvgiYhRrhsziCmqMJNouPckiOhk8T+9bSAK0VIA==",
|
| 6839 |
+
"dependencies": {
|
| 6840 |
+
"playwright-core": "1.43.1"
|
| 6841 |
+
},
|
| 6842 |
+
"bin": {
|
| 6843 |
+
"playwright": "cli.js"
|
| 6844 |
+
},
|
| 6845 |
+
"engines": {
|
| 6846 |
+
"node": ">=16"
|
| 6847 |
+
},
|
| 6848 |
+
"optionalDependencies": {
|
| 6849 |
+
"fsevents": "2.3.2"
|
| 6850 |
+
}
|
| 6851 |
+
},
|
| 6852 |
+
"node_modules/playwright-core": {
|
| 6853 |
+
"version": "1.43.1",
|
| 6854 |
+
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.43.1.tgz",
|
| 6855 |
+
"integrity": "sha512-EI36Mto2Vrx6VF7rm708qSnesVQKbxEWvPrfA1IPY6HgczBplDx7ENtx+K2n4kJ41sLLkuGfmb0ZLSSXlDhqPg==",
|
| 6856 |
+
"bin": {
|
| 6857 |
+
"playwright-core": "cli.js"
|
| 6858 |
+
},
|
| 6859 |
+
"engines": {
|
| 6860 |
+
"node": ">=16"
|
| 6861 |
+
}
|
| 6862 |
+
},
|
| 6863 |
+
"node_modules/playwright/node_modules/fsevents": {
|
| 6864 |
+
"version": "2.3.2",
|
| 6865 |
+
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
| 6866 |
+
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
| 6867 |
+
"hasInstallScript": true,
|
| 6868 |
+
"optional": true,
|
| 6869 |
+
"os": [
|
| 6870 |
+
"darwin"
|
| 6871 |
+
],
|
| 6872 |
+
"engines": {
|
| 6873 |
+
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
| 6874 |
+
}
|
| 6875 |
+
},
|
| 6876 |
"node_modules/postcss": {
|
| 6877 |
"version": "8.4.35",
|
| 6878 |
"resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.35.tgz",
|
|
|
|
| 7659 |
"rimraf": "bin.js"
|
| 7660 |
}
|
| 7661 |
},
|
| 7662 |
+
"node_modules/sanitize-html": {
|
| 7663 |
+
"version": "2.13.0",
|
| 7664 |
+
"resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.13.0.tgz",
|
| 7665 |
+
"integrity": "sha512-Xff91Z+4Mz5QiNSLdLWwjgBDm5b1RU6xBT0+12rapjiaR7SwfRdjw8f+6Rir2MXKLrDicRFHdb51hGOAxmsUIA==",
|
| 7666 |
+
"dependencies": {
|
| 7667 |
+
"deepmerge": "^4.2.2",
|
| 7668 |
+
"escape-string-regexp": "^4.0.0",
|
| 7669 |
+
"htmlparser2": "^8.0.0",
|
| 7670 |
+
"is-plain-object": "^5.0.0",
|
| 7671 |
+
"parse-srcset": "^1.0.2",
|
| 7672 |
+
"postcss": "^8.3.11"
|
| 7673 |
+
}
|
| 7674 |
+
},
|
| 7675 |
"node_modules/saslprep": {
|
| 7676 |
"version": "1.0.3",
|
| 7677 |
"resolved": "https://registry.npmjs.org/saslprep/-/saslprep-1.0.3.tgz",
|
|
|
|
| 7722 |
"node": ">=v12.22.7"
|
| 7723 |
}
|
| 7724 |
},
|
| 7725 |
+
"node_modules/sbd": {
|
| 7726 |
+
"version": "1.0.19",
|
| 7727 |
+
"resolved": "https://registry.npmjs.org/sbd/-/sbd-1.0.19.tgz",
|
| 7728 |
+
"integrity": "sha512-b5RyZMGSrFuIB4AHdbv12uYHS8YGEJ36gtuvG3RflbJGY+T0dXmAL0E4vZjQqT2RsX0v+ZwVqhV2zsGr5aFK9w==",
|
| 7729 |
+
"dependencies": {
|
| 7730 |
+
"sanitize-html": "^2.3.2"
|
| 7731 |
+
}
|
| 7732 |
+
},
|
| 7733 |
"node_modules/secure-json-parse": {
|
| 7734 |
"version": "2.7.0",
|
| 7735 |
"resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz",
|
|
|
|
| 8677 |
"node": ">=14.0.0"
|
| 8678 |
}
|
| 8679 |
},
|
| 8680 |
+
"node_modules/tldts-core": {
|
| 8681 |
+
"version": "6.1.18",
|
| 8682 |
+
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.18.tgz",
|
| 8683 |
+
"integrity": "sha512-e4wx32F/7dMBSZyKAx825Yte3U0PQtZZ0bkWxYQiwLteRVnQ5zM40fEbi0IyNtwQssgJAk3GCr7Q+w39hX0VKA=="
|
| 8684 |
+
},
|
| 8685 |
+
"node_modules/tldts-experimental": {
|
| 8686 |
+
"version": "6.1.18",
|
| 8687 |
+
"resolved": "https://registry.npmjs.org/tldts-experimental/-/tldts-experimental-6.1.18.tgz",
|
| 8688 |
+
"integrity": "sha512-E9/pAIybo7/MPdsQSKcCDElgObk78Be1gFqO645LbfhL5HG597sOeRQ55EuvIHlTo1Ypyyl+F/V+p0CnrTu3uQ==",
|
| 8689 |
+
"dependencies": {
|
| 8690 |
+
"tldts-core": "^6.1.18"
|
| 8691 |
+
}
|
| 8692 |
+
},
|
| 8693 |
"node_modules/to-regex-range": {
|
| 8694 |
"version": "5.0.1",
|
| 8695 |
"resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
|
|
@@ -28,6 +28,7 @@
|
|
| 28 |
"@types/jsdom": "^21.1.1",
|
| 29 |
"@types/minimist": "^1.2.5",
|
| 30 |
"@types/parquetjs": "^0.10.3",
|
|
|
|
| 31 |
"@types/uuid": "^9.0.8",
|
| 32 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
| 33 |
"@typescript-eslint/parser": "^6.x",
|
|
@@ -52,9 +53,11 @@
|
|
| 52 |
},
|
| 53 |
"type": "module",
|
| 54 |
"dependencies": {
|
|
|
|
| 55 |
"@huggingface/hub": "^0.5.1",
|
| 56 |
"@huggingface/inference": "^2.6.3",
|
| 57 |
"@iconify-json/bi": "^1.1.21",
|
|
|
|
| 58 |
"@resvg/resvg-js": "^2.6.0",
|
| 59 |
"@xenova/transformers": "^2.16.1",
|
| 60 |
"autoprefixer": "^10.4.14",
|
|
@@ -76,10 +79,12 @@
|
|
| 76 |
"parquetjs": "^0.11.2",
|
| 77 |
"pino": "^9.0.0",
|
| 78 |
"pino-pretty": "^11.0.0",
|
|
|
|
| 79 |
"postcss": "^8.4.31",
|
| 80 |
"saslprep": "^1.0.3",
|
| 81 |
"satori": "^0.10.11",
|
| 82 |
"satori-html": "^0.3.2",
|
|
|
|
| 83 |
"serpapi": "^1.1.1",
|
| 84 |
"sharp": "^0.33.2",
|
| 85 |
"tailwind-scrollbar": "^3.0.0",
|
|
|
|
| 28 |
"@types/jsdom": "^21.1.1",
|
| 29 |
"@types/minimist": "^1.2.5",
|
| 30 |
"@types/parquetjs": "^0.10.3",
|
| 31 |
+
"@types/sbd": "^1.0.5",
|
| 32 |
"@types/uuid": "^9.0.8",
|
| 33 |
"@typescript-eslint/eslint-plugin": "^6.x",
|
| 34 |
"@typescript-eslint/parser": "^6.x",
|
|
|
|
| 53 |
},
|
| 54 |
"type": "module",
|
| 55 |
"dependencies": {
|
| 56 |
+
"@cliqz/adblocker-playwright": "^1.27.2",
|
| 57 |
"@huggingface/hub": "^0.5.1",
|
| 58 |
"@huggingface/inference": "^2.6.3",
|
| 59 |
"@iconify-json/bi": "^1.1.21",
|
| 60 |
+
"@playwright/browser-chromium": "^1.43.1",
|
| 61 |
"@resvg/resvg-js": "^2.6.0",
|
| 62 |
"@xenova/transformers": "^2.16.1",
|
| 63 |
"autoprefixer": "^10.4.14",
|
|
|
|
| 79 |
"parquetjs": "^0.11.2",
|
| 80 |
"pino": "^9.0.0",
|
| 81 |
"pino-pretty": "^11.0.0",
|
| 82 |
+
"playwright": "^1.40.0",
|
| 83 |
"postcss": "^8.4.31",
|
| 84 |
"saslprep": "^1.0.3",
|
| 85 |
"satori": "^0.10.11",
|
| 86 |
"satori-html": "^0.3.2",
|
| 87 |
+
"sbd": "^1.0.19",
|
| 88 |
"serpapi": "^1.1.1",
|
| 89 |
"sharp": "^0.33.2",
|
| 90 |
"tailwind-scrollbar": "^3.0.0",
|
|
@@ -227,7 +227,7 @@
|
|
| 227 |
{#if webSearchSources?.length}
|
| 228 |
<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
|
| 229 |
<div class="text-gray-400">Sources:</div>
|
| 230 |
-
{#each webSearchSources as { link, title
|
| 231 |
<a
|
| 232 |
class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
|
| 233 |
href={link}
|
|
@@ -235,10 +235,10 @@
|
|
| 235 |
>
|
| 236 |
<img
|
| 237 |
class="h-3.5 w-3.5 rounded"
|
| 238 |
-
src="https://www.google.com/s2/favicons?sz=64&domain_url={hostname}"
|
| 239 |
alt="{title} favicon"
|
| 240 |
/>
|
| 241 |
-
<div>{hostname.replace(/^www\./, "")}</div>
|
| 242 |
</a>
|
| 243 |
{/each}
|
| 244 |
</div>
|
|
|
|
| 227 |
{#if webSearchSources?.length}
|
| 228 |
<div class="mt-4 flex flex-wrap items-center gap-x-2 gap-y-1.5 text-sm">
|
| 229 |
<div class="text-gray-400">Sources:</div>
|
| 230 |
+
{#each webSearchSources as { link, title }}
|
| 231 |
<a
|
| 232 |
class="flex items-center gap-2 whitespace-nowrap rounded-lg border bg-white px-2 py-1.5 leading-none hover:border-gray-300 dark:border-gray-800 dark:bg-gray-900 dark:hover:border-gray-700"
|
| 233 |
href={link}
|
|
|
|
| 235 |
>
|
| 236 |
<img
|
| 237 |
class="h-3.5 w-3.5 rounded"
|
| 238 |
+
src="https://www.google.com/s2/favicons?sz=64&domain_url={new URL(link).hostname}"
|
| 239 |
alt="{title} favicon"
|
| 240 |
/>
|
| 241 |
+
<div>{new URL(link).hostname.replace(/^www\./, "")}</div>
|
| 242 |
</a>
|
| 243 |
{/each}
|
| 244 |
</div>
|
|
@@ -32,7 +32,12 @@ export async function embeddingEndpointHfApi(
|
|
| 32 |
"Content-Type": "application/json",
|
| 33 |
...(authorization ? { Authorization: authorization } : {}),
|
| 34 |
},
|
| 35 |
-
body: JSON.stringify({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
});
|
| 37 |
|
| 38 |
if (!response.ok) {
|
|
|
|
| 32 |
"Content-Type": "application/json",
|
| 33 |
...(authorization ? { Authorization: authorization } : {}),
|
| 34 |
},
|
| 35 |
+
body: JSON.stringify({
|
| 36 |
+
inputs: {
|
| 37 |
+
source_sentence: batchInputs[0],
|
| 38 |
+
sentences: batchInputs.slice(1),
|
| 39 |
+
},
|
| 40 |
+
}),
|
| 41 |
});
|
| 42 |
|
| 43 |
if (!response.ok) {
|
|
@@ -1,26 +1,38 @@
|
|
| 1 |
import { Address6, Address4 } from "ip-address";
|
| 2 |
-
|
| 3 |
import dns from "node:dns";
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
dns.lookup(
|
| 8 |
-
if (err)
|
| 9 |
-
|
| 10 |
-
}
|
| 11 |
-
if (family === 4) {
|
| 12 |
-
const addr = new Address4(address);
|
| 13 |
-
resolve(addr.isInSubnet(new Address4("127.0.0.0/8")));
|
| 14 |
-
} else if (family === 6) {
|
| 15 |
-
const addr = new Address6(address);
|
| 16 |
-
resolve(
|
| 17 |
-
addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal()
|
| 18 |
-
);
|
| 19 |
-
} else {
|
| 20 |
-
reject(new Error("Unknown IP family"));
|
| 21 |
-
}
|
| 22 |
});
|
| 23 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
|
|
|
| 1 |
import { Address6, Address4 } from "ip-address";
|
|
|
|
| 2 |
import dns from "node:dns";
|
| 3 |
|
| 4 |
+
const dnsLookup = (hostname: string): Promise<{ address: string; family: number }> => {
|
| 5 |
+
return new Promise((resolve, reject) => {
|
| 6 |
+
dns.lookup(hostname, (err, address, family) => {
|
| 7 |
+
if (err) return reject(err);
|
| 8 |
+
resolve({ address, family });
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
});
|
| 10 |
});
|
| 11 |
+
};
|
| 12 |
+
|
| 13 |
+
export async function isURLLocal(URL: URL): Promise<boolean> {
|
| 14 |
+
const { address, family } = await dnsLookup(URL.hostname);
|
| 15 |
+
|
| 16 |
+
if (family === 4) {
|
| 17 |
+
const addr = new Address4(address);
|
| 18 |
+
const localSubnet = new Address4("127.0.0.0/8");
|
| 19 |
+
return addr.isInSubnet(localSubnet);
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
if (family === 6) {
|
| 23 |
+
const addr = new Address6(address);
|
| 24 |
+
return addr.isLoopback() || addr.isInSubnet(new Address6("::1/128")) || addr.isLinkLocal();
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
throw Error("Unknown IP family");
|
| 28 |
+
}
|
| 29 |
|
| 30 |
+
export function isURLStringLocal(url: string) {
|
| 31 |
+
try {
|
| 32 |
+
const urlObj = new URL(url);
|
| 33 |
+
return isURLLocal(urlObj);
|
| 34 |
+
} catch (e) {
|
| 35 |
+
// assume local if URL parsing fails
|
| 36 |
+
return true;
|
| 37 |
+
}
|
| 38 |
}
|
|
@@ -13,11 +13,9 @@ export async function preprocessMessages(
|
|
| 13 |
return await Promise.all(
|
| 14 |
structuredClone(messages).map(async (message, idx) => {
|
| 15 |
const webSearchContext = webSearch?.contextSources
|
| 16 |
-
.map(({ context }) => context)
|
| 17 |
-
.
|
| 18 |
-
|
| 19 |
-
.map(({ text }) => text)
|
| 20 |
-
.join(" ");
|
| 21 |
// start by adding websearch to the last message
|
| 22 |
if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
|
| 23 |
const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
|
|
@@ -27,7 +25,7 @@ export async function preprocessMessages(
|
|
| 27 |
.map((el) => el.content);
|
| 28 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
| 29 |
|
| 30 |
-
message.content = `I searched the web using the query: ${webSearch.searchQuery}.
|
| 31 |
Today is ${currentDate} and here are the results:
|
| 32 |
=====================
|
| 33 |
${webSearchContext}
|
|
|
|
| 13 |
return await Promise.all(
|
| 14 |
structuredClone(messages).map(async (message, idx) => {
|
| 15 |
const webSearchContext = webSearch?.contextSources
|
| 16 |
+
.map(({ context }) => context.trim())
|
| 17 |
+
.join("\n\n----------\n\n");
|
| 18 |
+
|
|
|
|
|
|
|
| 19 |
// start by adding websearch to the last message
|
| 20 |
if (idx === messages.length - 1 && webSearch && webSearchContext?.trim()) {
|
| 21 |
const lastQuestion = messages.findLast((el) => el.from === "user")?.content ?? "";
|
|
|
|
| 25 |
.map((el) => el.content);
|
| 26 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
| 27 |
|
| 28 |
+
message.content = `I searched the web using the query: ${webSearch.searchQuery}.
|
| 29 |
Today is ${currentDate} and here are the results:
|
| 30 |
=====================
|
| 31 |
${webSearchContext}
|
|
@@ -3,40 +3,31 @@ import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
|
|
| 3 |
import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
|
| 4 |
|
| 5 |
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
|
| 6 |
-
function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
|
| 7 |
return 1.0 - dot(embeddingA, embeddingB);
|
| 8 |
}
|
| 9 |
|
| 10 |
-
export async function
|
| 11 |
embeddingModel: EmbeddingBackendModel,
|
| 12 |
query: string,
|
| 13 |
-
sentences: string[]
|
| 14 |
-
|
| 15 |
-
): Promise<Embedding> {
|
| 16 |
const inputs = [
|
| 17 |
`${embeddingModel.preQuery}${query}`,
|
| 18 |
...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
|
| 19 |
];
|
| 20 |
|
| 21 |
const embeddingEndpoint = await embeddingModel.getEndpoint();
|
| 22 |
-
const output = await embeddingEndpoint({ inputs })
|
|
|
|
|
|
|
| 23 |
|
| 24 |
const queryEmbedding: Embedding = output[0];
|
| 25 |
const sentencesEmbeddings: Embedding[] = output.slice(1);
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
};
|
| 33 |
-
}
|
| 34 |
-
);
|
| 35 |
-
|
| 36 |
-
distancesFromQuery.sort((a, b) => {
|
| 37 |
-
return a.distance - b.distance;
|
| 38 |
-
});
|
| 39 |
-
|
| 40 |
-
// Return the indexes of the closest topK sentences
|
| 41 |
-
return distancesFromQuery.slice(0, topK).map((item) => item.index);
|
| 42 |
}
|
|
|
|
| 3 |
import type { Embedding } from "$lib/server/embeddingEndpoints/embeddingEndpoints";
|
| 4 |
|
| 5 |
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
|
| 6 |
+
export function innerProduct(embeddingA: Embedding, embeddingB: Embedding) {
|
| 7 |
return 1.0 - dot(embeddingA, embeddingB);
|
| 8 |
}
|
| 9 |
|
| 10 |
+
export async function getSentenceSimilarity(
|
| 11 |
embeddingModel: EmbeddingBackendModel,
|
| 12 |
query: string,
|
| 13 |
+
sentences: string[]
|
| 14 |
+
): Promise<{ distance: number; embedding: Embedding; idx: number }[]> {
|
|
|
|
| 15 |
const inputs = [
|
| 16 |
`${embeddingModel.preQuery}${query}`,
|
| 17 |
...sentences.map((sentence) => `${embeddingModel.prePassage}${sentence}`),
|
| 18 |
];
|
| 19 |
|
| 20 |
const embeddingEndpoint = await embeddingModel.getEndpoint();
|
| 21 |
+
const output = await embeddingEndpoint({ inputs }).catch((err) => {
|
| 22 |
+
throw Error("Failed to generate embeddings for sentence similarity", { cause: err });
|
| 23 |
+
});
|
| 24 |
|
| 25 |
const queryEmbedding: Embedding = output[0];
|
| 26 |
const sentencesEmbeddings: Embedding[] = output.slice(1);
|
| 27 |
|
| 28 |
+
return sentencesEmbeddings.map((sentenceEmbedding, idx) => ({
|
| 29 |
+
distance: innerProduct(queryEmbedding, sentenceEmbedding),
|
| 30 |
+
embedding: sentenceEmbedding,
|
| 31 |
+
idx,
|
| 32 |
+
}));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
}
|
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { EmbeddingBackendModel } from "$lib/server/embeddingModels";
|
| 2 |
+
import { getSentenceSimilarity } from "$lib/server/sentenceSimilarity";
|
| 3 |
+
|
| 4 |
+
/**
|
| 5 |
+
* Combines sentences together to reach the maximum character limit of the embedding model
|
| 6 |
+
* Improves performance considerably when using CPU embedding
|
| 7 |
+
*/
|
| 8 |
+
export async function getCombinedSentenceSimilarity(
|
| 9 |
+
embeddingModel: EmbeddingBackendModel,
|
| 10 |
+
query: string,
|
| 11 |
+
sentences: string[]
|
| 12 |
+
): ReturnType<typeof getSentenceSimilarity> {
|
| 13 |
+
const combinedSentences = sentences.reduce<{ text: string; indices: number[] }[]>(
|
| 14 |
+
(acc, sentence, idx) => {
|
| 15 |
+
const lastSentence = acc[acc.length - 1];
|
| 16 |
+
if (!lastSentence) return [{ text: sentence, indices: [idx] }];
|
| 17 |
+
if (lastSentence.text.length + sentence.length < embeddingModel.chunkCharLength) {
|
| 18 |
+
lastSentence.text += ` ${sentence}`;
|
| 19 |
+
lastSentence.indices.push(idx);
|
| 20 |
+
return acc;
|
| 21 |
+
}
|
| 22 |
+
return [...acc, { text: sentence, indices: [idx] }];
|
| 23 |
+
},
|
| 24 |
+
[]
|
| 25 |
+
);
|
| 26 |
+
|
| 27 |
+
const embeddings = await getSentenceSimilarity(
|
| 28 |
+
embeddingModel,
|
| 29 |
+
query,
|
| 30 |
+
combinedSentences.map(({ text }) => text)
|
| 31 |
+
);
|
| 32 |
+
|
| 33 |
+
return embeddings.flatMap((embedding, idx) => {
|
| 34 |
+
const { indices } = combinedSentences[idx];
|
| 35 |
+
return indices.map((i) => ({ ...embedding, idx: i }));
|
| 36 |
+
});
|
| 37 |
+
}
|
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { WebSearchScrapedSource, WebSearchUsedSource } from "$lib/types/WebSearch";
|
| 2 |
+
import type { EmbeddingBackendModel } from "../../embeddingModels";
|
| 3 |
+
import { getSentenceSimilarity, innerProduct } from "../../sentenceSimilarity";
|
| 4 |
+
import { MarkdownElementType, type MarkdownElement } from "../markdown/types";
|
| 5 |
+
import { stringifyMarkdownElement } from "../markdown/utils/stringify";
|
| 6 |
+
import { getCombinedSentenceSimilarity } from "./combine";
|
| 7 |
+
import { flattenTree } from "./tree";
|
| 8 |
+
|
| 9 |
+
const MIN_CHARS = 3_000;
|
| 10 |
+
const SOFT_MAX_CHARS = 8_000;
|
| 11 |
+
|
| 12 |
+
export async function findContextSources(
|
| 13 |
+
sources: WebSearchScrapedSource[],
|
| 14 |
+
prompt: string,
|
| 15 |
+
embeddingModel: EmbeddingBackendModel
|
| 16 |
+
) {
|
| 17 |
+
const sourcesMarkdownElems = sources.map((source) => flattenTree(source.page.markdownTree));
|
| 18 |
+
const markdownElems = sourcesMarkdownElems.flat();
|
| 19 |
+
|
| 20 |
+
// When using CPU embedding (transformersjs), join sentences together to the max character limit
|
| 21 |
+
// to reduce inference time
|
| 22 |
+
const embeddingFunc =
|
| 23 |
+
embeddingModel.endpoints[0].type === "transformersjs"
|
| 24 |
+
? getCombinedSentenceSimilarity
|
| 25 |
+
: getSentenceSimilarity;
|
| 26 |
+
|
| 27 |
+
const embeddings = await embeddingFunc(
|
| 28 |
+
embeddingModel,
|
| 29 |
+
prompt,
|
| 30 |
+
markdownElems
|
| 31 |
+
.map(stringifyMarkdownElement)
|
| 32 |
+
// Safety in case the stringified markdown elements are too long
|
| 33 |
+
// but chunking should have happened earlier
|
| 34 |
+
.map((elem) => elem.slice(0, embeddingModel.chunkCharLength))
|
| 35 |
+
);
|
| 36 |
+
|
| 37 |
+
const topEmbeddings = embeddings
|
| 38 |
+
.sort((a, b) => a.distance - b.distance)
|
| 39 |
+
.filter((embedding) => markdownElems[embedding.idx].type !== MarkdownElementType.Header);
|
| 40 |
+
|
| 41 |
+
let totalChars = 0;
|
| 42 |
+
const selectedMarkdownElems = new Set<MarkdownElement>();
|
| 43 |
+
const selectedEmbeddings: number[][] = [];
|
| 44 |
+
for (const embedding of topEmbeddings) {
|
| 45 |
+
const elem = markdownElems[embedding.idx];
|
| 46 |
+
|
| 47 |
+
// Ignore elements that are too similar to already selected elements
|
| 48 |
+
const tooSimilar = selectedEmbeddings.some(
|
| 49 |
+
(selectedEmbedding) => innerProduct(selectedEmbedding, embedding.embedding) < 0.01
|
| 50 |
+
);
|
| 51 |
+
if (tooSimilar) continue;
|
| 52 |
+
|
| 53 |
+
// Add element
|
| 54 |
+
if (!selectedMarkdownElems.has(elem)) {
|
| 55 |
+
selectedMarkdownElems.add(elem);
|
| 56 |
+
selectedEmbeddings.push(embedding.embedding);
|
| 57 |
+
totalChars += elem.content.length;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
// Add element's parent (header)
|
| 61 |
+
if (elem.parent && !selectedMarkdownElems.has(elem.parent)) {
|
| 62 |
+
selectedMarkdownElems.add(elem.parent);
|
| 63 |
+
totalChars += elem.parent.content.length;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
if (totalChars > SOFT_MAX_CHARS) break;
|
| 67 |
+
if (totalChars > MIN_CHARS && embedding.distance > 0.25) break;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
const contextSources = sourcesMarkdownElems
|
| 71 |
+
.map<WebSearchUsedSource>((elems, idx) => {
|
| 72 |
+
const sourceSelectedElems = elems.filter((elem) => selectedMarkdownElems.has(elem));
|
| 73 |
+
const context = sourceSelectedElems.map(stringifyMarkdownElement).join("\n");
|
| 74 |
+
const source = sources[idx];
|
| 75 |
+
return { ...source, context };
|
| 76 |
+
})
|
| 77 |
+
.filter((contextSource) => contextSource.context.length > 0);
|
| 78 |
+
|
| 79 |
+
return contextSources;
|
| 80 |
+
}
|
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { MarkdownElement } from "../markdown/types";
|
| 2 |
+
|
| 3 |
+
export function flattenTree(elem: MarkdownElement): MarkdownElement[] {
|
| 4 |
+
if ("children" in elem) return [elem, ...elem.children.flatMap(flattenTree)];
|
| 5 |
+
return [elem];
|
| 6 |
+
}
|
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { collapseString, sanitizeString } from "./utils/nlp";
|
| 2 |
+
import { stringifyHTMLElements, stringifyHTMLElementsUnformatted } from "./utils/stringify";
|
| 3 |
+
import { MarkdownElementType, tagNameMap, type HeaderElement, type MarkdownElement } from "./types";
|
| 4 |
+
import type { SerializedHTMLElement } from "../scrape/types";
|
| 5 |
+
|
| 6 |
+
interface ConversionState {
|
| 7 |
+
defaultType:
|
| 8 |
+
| MarkdownElementType.Paragraph
|
| 9 |
+
| MarkdownElementType.BlockQuote
|
| 10 |
+
| MarkdownElementType.UnorderedListItem
|
| 11 |
+
| MarkdownElementType.OrderedListItem;
|
| 12 |
+
listDepth: number;
|
| 13 |
+
blockQuoteDepth: number;
|
| 14 |
+
}
|
| 15 |
+
export function htmlElementToMarkdownElements(
|
| 16 |
+
parent: HeaderElement,
|
| 17 |
+
elem: SerializedHTMLElement | string,
|
| 18 |
+
prevState: ConversionState = {
|
| 19 |
+
defaultType: MarkdownElementType.Paragraph,
|
| 20 |
+
listDepth: 0,
|
| 21 |
+
blockQuoteDepth: 0,
|
| 22 |
+
}
|
| 23 |
+
): MarkdownElement | MarkdownElement[] {
|
| 24 |
+
// Found text so create an element based on the previous state
|
| 25 |
+
if (typeof elem === "string") {
|
| 26 |
+
if (elem.trim().length === 0) return [];
|
| 27 |
+
if (
|
| 28 |
+
prevState.defaultType === MarkdownElementType.UnorderedListItem ||
|
| 29 |
+
prevState.defaultType === MarkdownElementType.OrderedListItem
|
| 30 |
+
) {
|
| 31 |
+
return {
|
| 32 |
+
parent,
|
| 33 |
+
type: prevState.defaultType,
|
| 34 |
+
content: elem,
|
| 35 |
+
depth: prevState.listDepth,
|
| 36 |
+
};
|
| 37 |
+
}
|
| 38 |
+
if (prevState.defaultType === MarkdownElementType.BlockQuote) {
|
| 39 |
+
return {
|
| 40 |
+
parent,
|
| 41 |
+
type: prevState.defaultType,
|
| 42 |
+
content: elem,
|
| 43 |
+
depth: prevState.blockQuoteDepth,
|
| 44 |
+
};
|
| 45 |
+
}
|
| 46 |
+
return { parent, type: prevState.defaultType, content: elem };
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
const type = tagNameMap[elem.tagName] ?? MarkdownElementType.Paragraph;
|
| 50 |
+
|
| 51 |
+
// Update the state based on the current element
|
| 52 |
+
const state: ConversionState = { ...prevState };
|
| 53 |
+
if (type === MarkdownElementType.UnorderedList || type === MarkdownElementType.OrderedList) {
|
| 54 |
+
state.listDepth += 1;
|
| 55 |
+
state.defaultType =
|
| 56 |
+
type === MarkdownElementType.UnorderedList
|
| 57 |
+
? MarkdownElementType.UnorderedListItem
|
| 58 |
+
: MarkdownElementType.OrderedListItem;
|
| 59 |
+
}
|
| 60 |
+
if (type === MarkdownElementType.BlockQuote) {
|
| 61 |
+
state.defaultType = MarkdownElementType.BlockQuote;
|
| 62 |
+
state.blockQuoteDepth += 1;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
// Headers
|
| 66 |
+
if (type === MarkdownElementType.Header) {
|
| 67 |
+
return {
|
| 68 |
+
parent,
|
| 69 |
+
type,
|
| 70 |
+
level: Number(elem.tagName[1]),
|
| 71 |
+
content: collapseString(stringifyHTMLElements(elem.content)),
|
| 72 |
+
children: [],
|
| 73 |
+
};
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
// Code blocks
|
| 77 |
+
if (type === MarkdownElementType.CodeBlock) {
|
| 78 |
+
return {
|
| 79 |
+
parent,
|
| 80 |
+
type,
|
| 81 |
+
content: sanitizeString(stringifyHTMLElementsUnformatted(elem.content)),
|
| 82 |
+
};
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
// Typical case, we want to flatten the DOM and only create elements when we see text
|
| 86 |
+
return elem.content.flatMap((el) => htmlElementToMarkdownElements(parent, el, state));
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
export function mergeAdjacentElements(elements: MarkdownElement[]): MarkdownElement[] {
|
| 90 |
+
return elements.reduce<MarkdownElement[]>((acc, elem) => {
|
| 91 |
+
const last = acc[acc.length - 1];
|
| 92 |
+
if (last && last.type === MarkdownElementType.Paragraph && last.type === elem.type) {
|
| 93 |
+
last.content += elem.content;
|
| 94 |
+
return acc;
|
| 95 |
+
}
|
| 96 |
+
return [...acc, elem];
|
| 97 |
+
}, []);
|
| 98 |
+
}
|
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { SerializedHTMLElement } from "../scrape/types";
|
| 2 |
+
import { htmlElementToMarkdownElements, mergeAdjacentElements } from "./fromHtml";
|
| 3 |
+
import type { HeaderElement, MarkdownElement } from "./types";
|
| 4 |
+
import { MarkdownElementType } from "./types";
|
| 5 |
+
import { chunkElements } from "./utils/chunk";
|
| 6 |
+
|
| 7 |
+
/**
|
| 8 |
+
* Converts HTML elements to Markdown elements and creates a tree based on header tags
|
| 9 |
+
* For example: h1 [h2 [p p blockquote] h2 [h3 [...] ] ]
|
| 10 |
+
**/
|
| 11 |
+
export function htmlToMarkdownTree(
|
| 12 |
+
title: string,
|
| 13 |
+
htmlElements: SerializedHTMLElement[],
|
| 14 |
+
maxCharsPerElem: number
|
| 15 |
+
): HeaderElement {
|
| 16 |
+
let parent: HeaderElement = {
|
| 17 |
+
type: MarkdownElementType.Header,
|
| 18 |
+
level: 1,
|
| 19 |
+
parent: null,
|
| 20 |
+
content: title,
|
| 21 |
+
children: [],
|
| 22 |
+
};
|
| 23 |
+
|
| 24 |
+
const markdownElements = chunkElements(
|
| 25 |
+
mergeAdjacentElements(
|
| 26 |
+
htmlElements.flatMap((elem) => htmlElementToMarkdownElements(parent, elem))
|
| 27 |
+
),
|
| 28 |
+
maxCharsPerElem
|
| 29 |
+
);
|
| 30 |
+
|
| 31 |
+
for (const elem of markdownElements) {
|
| 32 |
+
if (elem.type !== MarkdownElementType.Header) {
|
| 33 |
+
elem.parent = parent;
|
| 34 |
+
parent.children.push(elem);
|
| 35 |
+
continue;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
// add 1 to current level to offset for the title being level 1
|
| 39 |
+
elem.level += 1;
|
| 40 |
+
|
| 41 |
+
// Pop up header levels until reaching the same level as the current header
|
| 42 |
+
// or until we reach the root
|
| 43 |
+
inner: while (parent !== null && parent.parent !== null) {
|
| 44 |
+
if (parent.level < elem.level) break inner;
|
| 45 |
+
parent = parent.parent;
|
| 46 |
+
}
|
| 47 |
+
parent.children.push(elem);
|
| 48 |
+
parent = elem;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
// Pop up to the root
|
| 52 |
+
while (parent.parent !== null) {
|
| 53 |
+
parent = parent.parent;
|
| 54 |
+
}
|
| 55 |
+
return parent;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
export function removeParents<T extends MarkdownElement>(elem: T): T {
|
| 59 |
+
if ("children" in elem) {
|
| 60 |
+
return { ...elem, parent: null, children: elem.children.map((child) => removeParents(child)) };
|
| 61 |
+
}
|
| 62 |
+
return { ...elem, parent: null };
|
| 63 |
+
}
|
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* eslint-disable-next-line no-shadow */
|
| 2 |
+
export enum MarkdownElementType {
|
| 3 |
+
Header = "HEADER",
|
| 4 |
+
Paragraph = "PARAGRAPH",
|
| 5 |
+
BlockQuote = "BLOCKQUOTE",
|
| 6 |
+
CodeBlock = "CODE_BLOCK",
|
| 7 |
+
|
| 8 |
+
UnorderedList = "UNORDERED_LIST",
|
| 9 |
+
OrderedList = "ORDERED_LIST",
|
| 10 |
+
UnorderedListItem = "UNORDERED_LIST_ITEM",
|
| 11 |
+
OrderedListItem = "ORDERED_LIST_ITEM",
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
interface BaseMarkdownElement<T = MarkdownElementType> {
|
| 15 |
+
type: T;
|
| 16 |
+
content: string;
|
| 17 |
+
parent: HeaderElement | null;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
export interface HeaderElement extends BaseMarkdownElement<MarkdownElementType.Header> {
|
| 21 |
+
level: number;
|
| 22 |
+
children: MarkdownElement[];
|
| 23 |
+
}
|
| 24 |
+
type ListItem = MarkdownElementType.UnorderedListItem | MarkdownElementType.OrderedListItem;
|
| 25 |
+
interface ListItemElement extends BaseMarkdownElement<ListItem> {
|
| 26 |
+
depth: number;
|
| 27 |
+
}
|
| 28 |
+
interface BlockQuoteElement extends BaseMarkdownElement<MarkdownElementType.BlockQuote> {
|
| 29 |
+
depth: number;
|
| 30 |
+
}
|
| 31 |
+
interface ParagraphElement extends BaseMarkdownElement<MarkdownElementType.Paragraph> {}
|
| 32 |
+
interface CodeBlockElement extends BaseMarkdownElement<MarkdownElementType.CodeBlock> {}
|
| 33 |
+
|
| 34 |
+
export type MarkdownElement =
|
| 35 |
+
| HeaderElement
|
| 36 |
+
| ParagraphElement
|
| 37 |
+
| BlockQuoteElement
|
| 38 |
+
| CodeBlockElement
|
| 39 |
+
| ListItemElement;
|
| 40 |
+
|
| 41 |
+
export const tagNameMap: Record<string, MarkdownElementType> = {
|
| 42 |
+
h1: MarkdownElementType.Header,
|
| 43 |
+
h2: MarkdownElementType.Header,
|
| 44 |
+
h3: MarkdownElementType.Header,
|
| 45 |
+
h4: MarkdownElementType.Header,
|
| 46 |
+
h5: MarkdownElementType.Header,
|
| 47 |
+
h6: MarkdownElementType.Header,
|
| 48 |
+
div: MarkdownElementType.Paragraph,
|
| 49 |
+
p: MarkdownElementType.Paragraph,
|
| 50 |
+
blockquote: MarkdownElementType.BlockQuote,
|
| 51 |
+
pre: MarkdownElementType.CodeBlock,
|
| 52 |
+
ul: MarkdownElementType.UnorderedList,
|
| 53 |
+
ol: MarkdownElementType.OrderedList,
|
| 54 |
+
li: MarkdownElementType.UnorderedListItem,
|
| 55 |
+
};
|
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { sentences as splitBySentences } from "sbd";
|
| 2 |
+
import { MarkdownElementType, type MarkdownElement } from "../types";
|
| 3 |
+
|
| 4 |
+
export function chunkElements(elements: MarkdownElement[], maxLength: number): MarkdownElement[] {
|
| 5 |
+
return elements.flatMap((elem) => {
|
| 6 |
+
// Can't split headers because it would break the tree, and this situation should be rare
|
| 7 |
+
// so we just cut off the end
|
| 8 |
+
if (elem.type === MarkdownElementType.Header) {
|
| 9 |
+
return { ...elem, content: elem.content.slice(0, maxLength) };
|
| 10 |
+
}
|
| 11 |
+
const contentChunks = enforceMaxLength(elem.content, maxLength);
|
| 12 |
+
return contentChunks.map<MarkdownElement>((content) => ({ ...elem, content }));
|
| 13 |
+
});
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
const delimitersByPriority = ["?", "!", ".", ";", ":", ",", "|", " - ", " ", "-"];
|
| 17 |
+
function enforceMaxLength(text: string, maxLength: number): string[] {
|
| 18 |
+
if (text.length <= maxLength) return [text].filter(Boolean);
|
| 19 |
+
return splitBySentences(text)
|
| 20 |
+
.flatMap((sentence) => {
|
| 21 |
+
if (sentence.length <= maxLength) return sentence;
|
| 22 |
+
|
| 23 |
+
// Discover all necessary split points to fit the sentence within the max length
|
| 24 |
+
const indices: [number, number][] = [];
|
| 25 |
+
while ((indices.at(-1)?.[1] ?? 0) < sentence.length) {
|
| 26 |
+
const prevIndex = indices.at(-1)?.[1] ?? 0;
|
| 27 |
+
|
| 28 |
+
// Remaining text fits within maxLength
|
| 29 |
+
if (prevIndex + maxLength >= sentence.length) {
|
| 30 |
+
indices.push([prevIndex, sentence.length]);
|
| 31 |
+
continue;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
const bestDelimiter = delimitersByPriority.find(
|
| 35 |
+
(delimiter) => sentence.lastIndexOf(delimiter, prevIndex + maxLength) !== -1
|
| 36 |
+
);
|
| 37 |
+
// Fallback in the unusual case that no delimiter is found
|
| 38 |
+
if (!bestDelimiter) {
|
| 39 |
+
indices.push([prevIndex, prevIndex + maxLength]);
|
| 40 |
+
continue;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
const closestDelimiter = sentence.lastIndexOf(bestDelimiter, prevIndex + maxLength);
|
| 44 |
+
indices.push([prevIndex, Math.max(prevIndex + 1, closestDelimiter)]);
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
return indices.map((sliceIndices) => sentence.slice(...sliceIndices));
|
| 48 |
+
})
|
| 49 |
+
.reduce<string[]>(
|
| 50 |
+
(chunks, sentence) => {
|
| 51 |
+
const lastChunk = chunks[chunks.length - 1];
|
| 52 |
+
if (lastChunk.length + sentence.length <= maxLength) {
|
| 53 |
+
return [...chunks.slice(0, -1), lastChunk + sentence];
|
| 54 |
+
}
|
| 55 |
+
return [...chunks, sentence];
|
| 56 |
+
},
|
| 57 |
+
[""]
|
| 58 |
+
)
|
| 59 |
+
.filter(Boolean);
|
| 60 |
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** Remove excess whitespace and newlines */
|
| 2 |
+
export const sanitizeString = (str: string) =>
|
| 3 |
+
str
|
| 4 |
+
.split("\n")
|
| 5 |
+
.map((s) => s.trim())
|
| 6 |
+
.filter(Boolean)
|
| 7 |
+
.join("\n")
|
| 8 |
+
.replaceAll(/ +/g, " ");
|
| 9 |
+
|
| 10 |
+
/** Collapses a string into a single line */
|
| 11 |
+
export const collapseString = (str: string) => sanitizeString(str.replaceAll(/\n/g, " "));
|
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { SerializedHTMLElement } from "../../scrape/types";
|
| 2 |
+
import { MarkdownElementType, type MarkdownElement } from "../types";
|
| 3 |
+
|
| 4 |
+
// --- Markdown Elements ---
|
| 5 |
+
|
| 6 |
+
/** Converts markdown element to a string with formatting */
|
| 7 |
+
export function stringifyMarkdownElement(elem: MarkdownElement): string {
|
| 8 |
+
const content = elem.content.trim();
|
| 9 |
+
if (elem.type === MarkdownElementType.Header) return `${"#".repeat(elem.level)} ${content}\n\n`;
|
| 10 |
+
if (elem.type === MarkdownElementType.BlockQuote) {
|
| 11 |
+
return `${"> ".repeat(elem.depth)}${content}\n\n`;
|
| 12 |
+
}
|
| 13 |
+
if (elem.type === MarkdownElementType.CodeBlock) return `\`\`\`\n${content}\n\`\`\`\n\n`;
|
| 14 |
+
|
| 15 |
+
if (elem.type === MarkdownElementType.UnorderedListItem) return `- ${content}\n`;
|
| 16 |
+
if (elem.type === MarkdownElementType.OrderedListItem) {
|
| 17 |
+
const siblings = elem.parent?.children ?? [elem];
|
| 18 |
+
const currentIndex = siblings.indexOf(elem);
|
| 19 |
+
const lastAdjacentIndex = siblings
|
| 20 |
+
.slice(currentIndex + 1)
|
| 21 |
+
.findLastIndex((child) => child.type === MarkdownElementType.OrderedListItem);
|
| 22 |
+
const order = currentIndex - lastAdjacentIndex + 1;
|
| 23 |
+
return `${order}. ${content}\n`;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
return `${content}\n\n`;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
// ----- HTML Elements -----
|
| 30 |
+
|
| 31 |
+
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
|
| 32 |
+
export function stringifyHTMLElements(elems: (SerializedHTMLElement | string)[]): string {
|
| 33 |
+
return elems.map(stringifyHTMLElement).join("").trim();
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */
|
| 37 |
+
export function stringifyHTMLElement(elem: SerializedHTMLElement | string): string {
|
| 38 |
+
if (typeof elem === "string") return elem;
|
| 39 |
+
if (elem.tagName === "br") return "\n";
|
| 40 |
+
|
| 41 |
+
const content = elem.content.map(stringifyHTMLElement).join("");
|
| 42 |
+
if (content.length === 0) return content;
|
| 43 |
+
|
| 44 |
+
if (elem.tagName === "strong" || elem.tagName === "b") return `**${content}**`;
|
| 45 |
+
if (elem.tagName === "em" || elem.tagName === "i") return `*${content}*`;
|
| 46 |
+
if (elem.tagName === "s" || elem.tagName === "strike") return `~~${content}~~`;
|
| 47 |
+
|
| 48 |
+
if (elem.tagName === "code" || elem.tagName === "var" || elem.tagName === "tt") {
|
| 49 |
+
return `\`${content}\``;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
if (elem.tagName === "sup") return `<sup>${content}</sup>`;
|
| 53 |
+
if (elem.tagName === "sub") return `<sub>${content}</sub>`;
|
| 54 |
+
|
| 55 |
+
if (elem.tagName === "a" && content.trim().length > 0) {
|
| 56 |
+
const href = elem.attributes.href;
|
| 57 |
+
if (!href) return elem.content.map(stringifyHTMLElement).join("");
|
| 58 |
+
return `[${elem.content.map(stringifyHTMLElement).join("")}](${href})`;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
return elem.content.map(stringifyHTMLElement).join("");
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
/** Grabs all text content directly, ignoring HTML tags */
|
| 65 |
+
export function stringifyHTMLElementsUnformatted(
|
| 66 |
+
elems: (SerializedHTMLElement | string)[]
|
| 67 |
+
): string {
|
| 68 |
+
return elems.map(stringifyHTMLElementUnformatted).join("");
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
/** Grabs all text content directly, ignoring HTML tags */
|
| 72 |
+
function stringifyHTMLElementUnformatted(elem: SerializedHTMLElement | string): string {
|
| 73 |
+
if (typeof elem === "string") return elem;
|
| 74 |
+
return elem.content.map(stringifyHTMLElementUnformatted).join("");
|
| 75 |
+
}
|
|
@@ -1,41 +0,0 @@
|
|
| 1 |
-
import { JSDOM, VirtualConsole } from "jsdom";
|
| 2 |
-
|
| 3 |
-
export async function parseWeb(url: string) {
|
| 4 |
-
const abortController = new AbortController();
|
| 5 |
-
setTimeout(() => abortController.abort(), 10000);
|
| 6 |
-
const r = await fetch(url, { signal: abortController.signal, credentials: "omit" }).catch();
|
| 7 |
-
|
| 8 |
-
if (r.headers.get("content-type")?.includes("text/html")) {
|
| 9 |
-
const virtualConsole = new VirtualConsole();
|
| 10 |
-
virtualConsole.on("error", () => {
|
| 11 |
-
// No-op to skip console errors.
|
| 12 |
-
});
|
| 13 |
-
|
| 14 |
-
// put the html string into a DOM
|
| 15 |
-
const dom = new JSDOM((await r.text()) ?? "", {
|
| 16 |
-
virtualConsole,
|
| 17 |
-
});
|
| 18 |
-
|
| 19 |
-
const { document } = dom.window;
|
| 20 |
-
const paragraphs = document.querySelectorAll("p, table, pre, ul, ol");
|
| 21 |
-
|
| 22 |
-
if (!paragraphs.length) {
|
| 23 |
-
throw new Error(`webpage doesn't have any parseable element`);
|
| 24 |
-
}
|
| 25 |
-
const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
|
| 26 |
-
|
| 27 |
-
// combine text contents from paragraphs and then remove newlines and multiple spaces
|
| 28 |
-
const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
|
| 29 |
-
|
| 30 |
-
return text;
|
| 31 |
-
} else if (
|
| 32 |
-
r.headers.get("content-type")?.includes("text/plain") ||
|
| 33 |
-
r.headers.get("content-type")?.includes("text/markdown")
|
| 34 |
-
) {
|
| 35 |
-
const text = await r.text();
|
| 36 |
-
// JSON.stringify is needed to turn string concatenation into a single string (ex: "Hello, " + "world!" -> "Hello, world!")
|
| 37 |
-
return JSON.stringify(text);
|
| 38 |
-
} else {
|
| 39 |
-
throw new Error("Unsupported content type");
|
| 40 |
-
}
|
| 41 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,179 +1,103 @@
|
|
| 1 |
-
import { searchWeb } from "$lib/server/websearch/searchWeb";
|
| 2 |
-
import { generateQuery } from "$lib/server/websearch/generateQuery";
|
| 3 |
-
import { parseWeb } from "$lib/server/websearch/parseWeb";
|
| 4 |
-
import { chunk } from "$lib/utils/chunk";
|
| 5 |
-
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
|
| 6 |
-
import { getWebSearchProvider } from "./searchWeb";
|
| 7 |
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
|
| 8 |
-
import { env } from "$env/dynamic/private";
|
| 9 |
|
| 10 |
import type { Conversation } from "$lib/types/Conversation";
|
| 11 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
| 12 |
import type { Message } from "$lib/types/Message";
|
| 13 |
-
import type { WebSearch,
|
| 14 |
import type { Assistant } from "$lib/types/Assistant";
|
| 15 |
|
| 16 |
-
import {
|
| 17 |
-
import
|
| 18 |
-
import {
|
|
|
|
| 19 |
|
| 20 |
-
const
|
| 21 |
-
const
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
export async function runWebSearch(
|
| 29 |
conv: Conversation,
|
| 30 |
messages: Message[],
|
| 31 |
updatePad: (upd: MessageUpdate) => void,
|
| 32 |
ragSettings?: Assistant["rag"]
|
| 33 |
-
) {
|
| 34 |
const prompt = messages[messages.length - 1].content;
|
| 35 |
-
const
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
results: [],
|
| 39 |
-
contextSources: [],
|
| 40 |
-
createdAt: new Date(),
|
| 41 |
-
updatedAt: new Date(),
|
| 42 |
-
};
|
| 43 |
-
|
| 44 |
-
function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
|
| 45 |
-
updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
|
| 46 |
-
}
|
| 47 |
|
| 48 |
try {
|
| 49 |
-
// if the assistant specified direct links, skip the websearch
|
| 50 |
-
if (ragSettings && ragSettings?.allowedLinks.length > 0) {
|
| 51 |
-
appendUpdate("Using links specified in Assistant");
|
| 52 |
-
|
| 53 |
-
let linksToUse = [...ragSettings.allowedLinks];
|
| 54 |
-
|
| 55 |
-
if (env.ENABLE_LOCAL_FETCH !== "true") {
|
| 56 |
-
const localLinks = await Promise.all(
|
| 57 |
-
linksToUse.map(async (link) => {
|
| 58 |
-
try {
|
| 59 |
-
const url = new URL(link);
|
| 60 |
-
return await isURLLocal(url);
|
| 61 |
-
} catch (e) {
|
| 62 |
-
return true;
|
| 63 |
-
}
|
| 64 |
-
})
|
| 65 |
-
);
|
| 66 |
-
|
| 67 |
-
linksToUse = linksToUse.filter((_, index) => !localLinks[index]);
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
webSearch.results = linksToUse.map((link) => {
|
| 71 |
-
return { link, hostname: new URL(link).hostname, title: "", text: "" };
|
| 72 |
-
});
|
| 73 |
-
} else {
|
| 74 |
-
webSearch.searchQuery = await generateQuery(messages);
|
| 75 |
-
const searchProvider = getWebSearchProvider();
|
| 76 |
-
appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
|
| 77 |
-
|
| 78 |
-
let filters = "";
|
| 79 |
-
if (ragSettings && ragSettings?.allowedDomains.length > 0) {
|
| 80 |
-
appendUpdate("Filtering on specified domains");
|
| 81 |
-
filters += ragSettings.allowedDomains.map((item) => "site:" + item).join(" OR ");
|
| 82 |
-
}
|
| 83 |
-
|
| 84 |
-
// handle the global lists
|
| 85 |
-
filters +=
|
| 86 |
-
allowList.map((item) => "site:" + item).join(" OR ") +
|
| 87 |
-
" " +
|
| 88 |
-
blockList.map((item) => "-site:" + item).join(" ");
|
| 89 |
-
|
| 90 |
-
webSearch.searchQuery = filters + " " + webSearch.searchQuery;
|
| 91 |
-
|
| 92 |
-
const results = await searchWeb(webSearch.searchQuery);
|
| 93 |
-
webSearch.results =
|
| 94 |
-
(results.organic_results &&
|
| 95 |
-
results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
|
| 96 |
-
try {
|
| 97 |
-
const { title, link, text } = el;
|
| 98 |
-
const { hostname } = new URL(link);
|
| 99 |
-
return { title, link, hostname, text };
|
| 100 |
-
} catch (e) {
|
| 101 |
-
// Ignore Errors
|
| 102 |
-
return null;
|
| 103 |
-
}
|
| 104 |
-
})) ??
|
| 105 |
-
[];
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
webSearch.results = webSearch.results.filter((value) => value !== null);
|
| 109 |
-
webSearch.results = webSearch.results
|
| 110 |
-
.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
|
| 111 |
-
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
|
| 112 |
-
|
| 113 |
-
// fetch the model
|
| 114 |
const embeddingModel =
|
| 115 |
embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
|
| 116 |
-
|
| 117 |
if (!embeddingModel) {
|
| 118 |
-
throw
|
| 119 |
}
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
paragraphChunks = nestedParagraphChunks.flat();
|
| 142 |
-
if (!paragraphChunks.length) {
|
| 143 |
-
throw new Error("No text found on the first 5 results");
|
| 144 |
-
}
|
| 145 |
-
} else {
|
| 146 |
-
throw new Error("No results found for this search query");
|
| 147 |
}
|
| 148 |
|
|
|
|
| 149 |
appendUpdate("Extracting relevant information");
|
| 150 |
-
const
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
const { source } = paragraphChunks[idx];
|
| 158 |
-
const contextWithId = { idx, text: texts[idx] };
|
| 159 |
-
const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
|
| 160 |
-
if (usedSource) {
|
| 161 |
-
usedSource.context.push(contextWithId);
|
| 162 |
-
} else {
|
| 163 |
-
webSearch.contextSources.push({ ...source, context: [contextWithId] });
|
| 164 |
-
}
|
| 165 |
-
}
|
| 166 |
updatePad({
|
| 167 |
type: "webSearch",
|
| 168 |
messageType: "sources",
|
| 169 |
message: "sources",
|
| 170 |
-
sources:
|
| 171 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
} catch (searchError) {
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
}
|
| 177 |
-
|
| 178 |
-
return webSearch;
|
| 179 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
|
|
|
|
| 2 |
|
| 3 |
import type { Conversation } from "$lib/types/Conversation";
|
| 4 |
import type { MessageUpdate } from "$lib/types/MessageUpdate";
|
| 5 |
import type { Message } from "$lib/types/Message";
|
| 6 |
+
import type { WebSearch, WebSearchScrapedSource } from "$lib/types/WebSearch";
|
| 7 |
import type { Assistant } from "$lib/types/Assistant";
|
| 8 |
|
| 9 |
+
import { search } from "./search/search";
|
| 10 |
+
import { scrape } from "./scrape/scrape";
|
| 11 |
+
import { findContextSources } from "./embed/embed";
|
| 12 |
+
import { removeParents } from "./markdown/tree";
|
| 13 |
|
| 14 |
+
const MAX_N_PAGES_TO_SCRAPE = 8 as const;
|
| 15 |
+
const MAX_N_PAGES_TO_EMBED = 5 as const;
|
| 16 |
|
| 17 |
+
export type AppendUpdate = (message: string, args?: string[], type?: "error" | "update") => void;
|
| 18 |
+
const makeAppendUpdate =
|
| 19 |
+
(updatePad: (upd: MessageUpdate) => void): AppendUpdate =>
|
| 20 |
+
(message, args, type) =>
|
| 21 |
+
updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
|
| 22 |
|
| 23 |
export async function runWebSearch(
|
| 24 |
conv: Conversation,
|
| 25 |
messages: Message[],
|
| 26 |
updatePad: (upd: MessageUpdate) => void,
|
| 27 |
ragSettings?: Assistant["rag"]
|
| 28 |
+
): Promise<WebSearch> {
|
| 29 |
const prompt = messages[messages.length - 1].content;
|
| 30 |
+
const createdAt = new Date();
|
| 31 |
+
const updatedAt = new Date();
|
| 32 |
+
const appendUpdate = makeAppendUpdate(updatePad);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
const embeddingModel =
|
| 36 |
embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
|
|
|
|
| 37 |
if (!embeddingModel) {
|
| 38 |
+
throw Error(`Embedding model ${conv.embeddingModel} not available anymore`);
|
| 39 |
}
|
| 40 |
|
| 41 |
+
// Search the web
|
| 42 |
+
const { searchQuery, pages } = await search(messages, ragSettings, appendUpdate);
|
| 43 |
+
if (pages.length === 0) throw Error("No results found for this search query");
|
| 44 |
+
|
| 45 |
+
// Scrape pages
|
| 46 |
+
appendUpdate("Browsing search results");
|
| 47 |
+
|
| 48 |
+
const scrapedPages = await Promise.all(
|
| 49 |
+
pages
|
| 50 |
+
.slice(0, MAX_N_PAGES_TO_SCRAPE)
|
| 51 |
+
.map(scrape(appendUpdate, embeddingModel.chunkCharLength))
|
| 52 |
+
).then((allScrapedPages) =>
|
| 53 |
+
allScrapedPages
|
| 54 |
+
.filter((p): p is WebSearchScrapedSource => Boolean(p))
|
| 55 |
+
.filter((p) => p.page.markdownTree.children.length > 0)
|
| 56 |
+
.slice(0, MAX_N_PAGES_TO_EMBED)
|
| 57 |
+
);
|
| 58 |
+
|
| 59 |
+
if (!scrapedPages.length) {
|
| 60 |
+
throw Error(`No text found in the first ${MAX_N_PAGES_TO_SCRAPE} results`);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
|
| 63 |
+
// Chunk the text of each of the elements and find the most similar chunks to the prompt
|
| 64 |
appendUpdate("Extracting relevant information");
|
| 65 |
+
const contextSources = await findContextSources(scrapedPages, prompt, embeddingModel).then(
|
| 66 |
+
(ctxSources) =>
|
| 67 |
+
ctxSources.map((source) => ({
|
| 68 |
+
...source,
|
| 69 |
+
page: { ...source.page, markdownTree: removeParents(source.page.markdownTree) },
|
| 70 |
+
}))
|
| 71 |
+
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
updatePad({
|
| 73 |
type: "webSearch",
|
| 74 |
messageType: "sources",
|
| 75 |
message: "sources",
|
| 76 |
+
sources: contextSources,
|
| 77 |
});
|
| 78 |
+
|
| 79 |
+
return {
|
| 80 |
+
prompt,
|
| 81 |
+
searchQuery,
|
| 82 |
+
results: scrapedPages.map(({ page, ...source }) => ({
|
| 83 |
+
...source,
|
| 84 |
+
page: { ...page, markdownTree: removeParents(page.markdownTree) },
|
| 85 |
+
})),
|
| 86 |
+
contextSources,
|
| 87 |
+
createdAt,
|
| 88 |
+
updatedAt,
|
| 89 |
+
};
|
| 90 |
} catch (searchError) {
|
| 91 |
+
const message = searchError instanceof Error ? searchError.message : String(searchError);
|
| 92 |
+
console.error(message);
|
| 93 |
+
appendUpdate("An error occurred", [JSON.stringify(message)], "error");
|
| 94 |
+
return {
|
| 95 |
+
prompt,
|
| 96 |
+
searchQuery: "",
|
| 97 |
+
results: [],
|
| 98 |
+
contextSources: [],
|
| 99 |
+
createdAt,
|
| 100 |
+
updatedAt,
|
| 101 |
+
};
|
| 102 |
}
|
|
|
|
|
|
|
| 103 |
}
|
|
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { SerializedHTMLElement } from "./types";
|
| 2 |
+
|
| 3 |
+
interface DBSCANOptions<T> {
|
| 4 |
+
dataset: T[];
|
| 5 |
+
epsilon?: number;
|
| 6 |
+
epsilonCompare?: (distance: number, epsilon: number) => boolean;
|
| 7 |
+
minimumPoints?: number;
|
| 8 |
+
distanceFunction: (a: T, b: T) => number;
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
export function spatialParser() {
|
| 12 |
+
/**
|
| 13 |
+
* Implementation for dbscan, inlined and migrated to typescript from https://github.com/cdxOo/dbscan (MIT License)
|
| 14 |
+
*/
|
| 15 |
+
const DBSCAN = <T>({
|
| 16 |
+
dataset,
|
| 17 |
+
epsilon = 1,
|
| 18 |
+
epsilonCompare = (dist, e) => dist < e,
|
| 19 |
+
minimumPoints = 2,
|
| 20 |
+
distanceFunction,
|
| 21 |
+
}: DBSCANOptions<T>) => {
|
| 22 |
+
const visitedIndices: Record<number, boolean> = {};
|
| 23 |
+
const isVisited = (i: number) => visitedIndices[i];
|
| 24 |
+
const markVisited = (i: number) => {
|
| 25 |
+
visitedIndices[i] = true;
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
const clusteredIndices: Record<number, boolean> = {};
|
| 29 |
+
const isClustered = (i: number) => clusteredIndices[i];
|
| 30 |
+
const markClustered = (i: number) => {
|
| 31 |
+
clusteredIndices[i] = true;
|
| 32 |
+
};
|
| 33 |
+
|
| 34 |
+
const uniqueMerge = <U>(targetArray: U[], sourceArray: U[]) => {
|
| 35 |
+
for (let i = 0; i < sourceArray.length; i += 1) {
|
| 36 |
+
const item = sourceArray[i];
|
| 37 |
+
if (targetArray.indexOf(item) < 0) {
|
| 38 |
+
targetArray.push(item);
|
| 39 |
+
}
|
| 40 |
+
}
|
| 41 |
+
};
|
| 42 |
+
|
| 43 |
+
const findNeighbors = (index: number) => {
|
| 44 |
+
const neighbors = [];
|
| 45 |
+
for (let other = 0; other < dataset.length; other += 1) {
|
| 46 |
+
const distance = distanceFunction(dataset[index], dataset[other]);
|
| 47 |
+
if (epsilonCompare(distance, epsilon)) {
|
| 48 |
+
neighbors.push(other);
|
| 49 |
+
}
|
| 50 |
+
}
|
| 51 |
+
return neighbors;
|
| 52 |
+
};
|
| 53 |
+
|
| 54 |
+
const noise: number[] = [];
|
| 55 |
+
const addNoise = (i: number) => noise.push(i);
|
| 56 |
+
|
| 57 |
+
const clusters: number[][] = [];
|
| 58 |
+
const createCluster = () => clusters.push([]) - 1;
|
| 59 |
+
const addIndexToCluster = (c: number, i: number) => {
|
| 60 |
+
clusters[c].push(i);
|
| 61 |
+
markClustered(i);
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
const expandCluster = (c: number, neighbors: number[]) => {
|
| 65 |
+
for (let i = 0; i < neighbors.length; i += 1) {
|
| 66 |
+
const neighborIndex = neighbors[i];
|
| 67 |
+
if (!isVisited(neighborIndex)) {
|
| 68 |
+
markVisited(neighborIndex);
|
| 69 |
+
|
| 70 |
+
const secondaryNeighbors = findNeighbors(neighborIndex);
|
| 71 |
+
if (secondaryNeighbors.length >= minimumPoints) {
|
| 72 |
+
uniqueMerge(neighbors, secondaryNeighbors);
|
| 73 |
+
}
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
if (!isClustered(neighborIndex)) {
|
| 77 |
+
addIndexToCluster(c, neighborIndex);
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
};
|
| 81 |
+
|
| 82 |
+
dataset.forEach((_, index) => {
|
| 83 |
+
if (!isVisited(index)) {
|
| 84 |
+
markVisited(index);
|
| 85 |
+
|
| 86 |
+
const neighbors = findNeighbors(index);
|
| 87 |
+
if (neighbors.length < minimumPoints) {
|
| 88 |
+
addNoise(index);
|
| 89 |
+
} else {
|
| 90 |
+
const clusterIndex = createCluster();
|
| 91 |
+
addIndexToCluster(clusterIndex, index);
|
| 92 |
+
expandCluster(clusterIndex, neighbors);
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
});
|
| 96 |
+
|
| 97 |
+
return { clusters, noise };
|
| 98 |
+
};
|
| 99 |
+
|
| 100 |
+
// -----------
|
| 101 |
+
// Scraping implementation
|
| 102 |
+
|
| 103 |
+
const IgnoredTagsList = [
|
| 104 |
+
"footer",
|
| 105 |
+
"nav",
|
| 106 |
+
"aside",
|
| 107 |
+
"script",
|
| 108 |
+
"style",
|
| 109 |
+
"noscript",
|
| 110 |
+
"form",
|
| 111 |
+
"button",
|
| 112 |
+
];
|
| 113 |
+
const InlineTags = [
|
| 114 |
+
"a",
|
| 115 |
+
"abbrv",
|
| 116 |
+
"span",
|
| 117 |
+
"address",
|
| 118 |
+
"time",
|
| 119 |
+
"acronym",
|
| 120 |
+
"strong",
|
| 121 |
+
"b",
|
| 122 |
+
"br",
|
| 123 |
+
"sub",
|
| 124 |
+
"sup",
|
| 125 |
+
"tt",
|
| 126 |
+
"var",
|
| 127 |
+
"em",
|
| 128 |
+
"i",
|
| 129 |
+
];
|
| 130 |
+
|
| 131 |
+
type ReadableNode = HTMLElement;
|
| 132 |
+
type NodeWithRect = {
|
| 133 |
+
node: ReadableNode;
|
| 134 |
+
rect: DOMRect;
|
| 135 |
+
};
|
| 136 |
+
|
| 137 |
+
const isOnlyChild = (node: Node) => {
|
| 138 |
+
if (!node.parentElement) return true;
|
| 139 |
+
if (node.parentElement.nodeName === "body") return false;
|
| 140 |
+
if (node.parentElement.childNodes.length === 1) return true;
|
| 141 |
+
return false;
|
| 142 |
+
};
|
| 143 |
+
|
| 144 |
+
const hasValidInlineParent = (node: Node) => {
|
| 145 |
+
return node.parentElement && !node.parentElement.matches("div, section, article, main, body ");
|
| 146 |
+
};
|
| 147 |
+
|
| 148 |
+
const hasValidParent = (node: Node) => {
|
| 149 |
+
return node.parentElement && !node.parentElement.isSameNode(document.body);
|
| 150 |
+
};
|
| 151 |
+
|
| 152 |
+
const possibleCodeParents = Array.from(document.querySelectorAll("pre, p"));
|
| 153 |
+
const possibleTableParents = Array.from(document.querySelectorAll("table"));
|
| 154 |
+
const possibleListParents = Array.from(document.querySelectorAll("ul, ol"));
|
| 155 |
+
/**
|
| 156 |
+
* We want to find the highest parent of text node in the cluster.
|
| 157 |
+
* For example in this case: <p><span>Text here</span></p>
|
| 158 |
+
* the P tag is highest parent.
|
| 159 |
+
*/
|
| 160 |
+
const findHighestDirectParentOfReadableNode = (node: Node): HTMLElement => {
|
| 161 |
+
// go up the tree until the parent is no longer an only child
|
| 162 |
+
let parent = node.parentElement;
|
| 163 |
+
// if the parent is an inline tag, then go up one more level
|
| 164 |
+
while (
|
| 165 |
+
parent &&
|
| 166 |
+
hasValidInlineParent(parent) &&
|
| 167 |
+
InlineTags.includes(parent?.tagName.toLowerCase())
|
| 168 |
+
) {
|
| 169 |
+
parent = parent.parentElement;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
while (parent && isOnlyChild(parent)) {
|
| 173 |
+
if (!hasValidParent(parent)) break;
|
| 174 |
+
parent = parent.parentElement;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
if (!parent) {
|
| 178 |
+
throw new Error(
|
| 179 |
+
"disconnected node found, this should not really be possible when traversing through the dom"
|
| 180 |
+
);
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// if the parent is a span, code or div tag check if there is a pre tag or p tag above it
|
| 184 |
+
if (["span", "code", "div"].includes(parent.nodeName.toLowerCase())) {
|
| 185 |
+
const hasParent = possibleCodeParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
| 186 |
+
if (hasParent) {
|
| 187 |
+
parent = hasParent;
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
// if the parent is a li tag check if there is a ul or ol tag above it
|
| 192 |
+
if (parent.nodeName.toLowerCase() === "li") {
|
| 193 |
+
const hasParent = possibleListParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
| 194 |
+
if (hasParent) {
|
| 195 |
+
parent = hasParent;
|
| 196 |
+
}
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
// if the parent is a td, th, tr tag check if there is a table tag above it
|
| 200 |
+
if (["td", "th", "tr"].includes(parent.nodeName.toLowerCase())) {
|
| 201 |
+
const hasParent = possibleTableParents.find((tag) => tag.contains(parent)) as HTMLElement;
|
| 202 |
+
if (hasParent) {
|
| 203 |
+
parent = hasParent;
|
| 204 |
+
}
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
return parent;
|
| 208 |
+
};
|
| 209 |
+
const barredNodes = Array.from(document.querySelectorAll(IgnoredTagsList.join(",")));
|
| 210 |
+
|
| 211 |
+
const doesNodePassHeuristics = (node: Node) => {
|
| 212 |
+
if ((node.textContent ?? "").trim().length < 10) {
|
| 213 |
+
return false;
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
const parentNode = findHighestDirectParentOfReadableNode(node);
|
| 217 |
+
|
| 218 |
+
if (parentNode && parentNode instanceof Element) {
|
| 219 |
+
if (
|
| 220 |
+
!parentNode.checkVisibility({
|
| 221 |
+
checkOpacity: true,
|
| 222 |
+
checkVisibilityCSS: true,
|
| 223 |
+
})
|
| 224 |
+
)
|
| 225 |
+
return false;
|
| 226 |
+
|
| 227 |
+
const rect = parentNode.getBoundingClientRect();
|
| 228 |
+
// elements that are readable usually don't have really small height or width
|
| 229 |
+
if (rect.width < 4 || rect.height < 4) {
|
| 230 |
+
return false;
|
| 231 |
+
}
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
if (parentNode && parentNode instanceof Element) {
|
| 235 |
+
if (barredNodes.some((barredNode) => barredNode.contains(parentNode))) {
|
| 236 |
+
return false;
|
| 237 |
+
}
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
return true;
|
| 241 |
+
};
|
| 242 |
+
|
| 243 |
+
const getAllReadableNodes = (): NodeWithRect[] => {
|
| 244 |
+
if (!document.body) throw new Error("Page failed to load");
|
| 245 |
+
const treeWalker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
|
| 246 |
+
acceptNode(node) {
|
| 247 |
+
if (doesNodePassHeuristics(node)) {
|
| 248 |
+
return NodeFilter.FILTER_ACCEPT;
|
| 249 |
+
} else {
|
| 250 |
+
return NodeFilter.FILTER_SKIP;
|
| 251 |
+
}
|
| 252 |
+
},
|
| 253 |
+
});
|
| 254 |
+
|
| 255 |
+
const readableNodes = [];
|
| 256 |
+
|
| 257 |
+
while (treeWalker.nextNode()) {
|
| 258 |
+
readableNodes.push(treeWalker.currentNode as ReadableNode);
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
/*
|
| 262 |
+
* <table><p>hello</p><p>world</p></table>
|
| 263 |
+
* table is already included in the parent of the first p tag
|
| 264 |
+
*/
|
| 265 |
+
|
| 266 |
+
const parentsForReadableNodes = readableNodes.map(findHighestDirectParentOfReadableNode);
|
| 267 |
+
const listWithOnlyParents: HTMLElement[] = [];
|
| 268 |
+
// find unique nodes in the parent list, a unique node is a node that is not a child of any other node in the list
|
| 269 |
+
for (let i = 0; i < parentsForReadableNodes.length; i++) {
|
| 270 |
+
const node = parentsForReadableNodes[i];
|
| 271 |
+
const hasParentInList = parentsForReadableNodes.find((otherNode, idx) => {
|
| 272 |
+
if (i === idx) return false;
|
| 273 |
+
return otherNode.contains(node);
|
| 274 |
+
});
|
| 275 |
+
listWithOnlyParents.push(hasParentInList ? hasParentInList : node);
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
const uniqueParents = Array.from(new Set(listWithOnlyParents));
|
| 279 |
+
|
| 280 |
+
return uniqueParents.map((node) => {
|
| 281 |
+
return {
|
| 282 |
+
node,
|
| 283 |
+
rect: node.getBoundingClientRect(),
|
| 284 |
+
};
|
| 285 |
+
});
|
| 286 |
+
};
|
| 287 |
+
|
| 288 |
+
const distanceFunction = (a: NodeWithRect, b: NodeWithRect) => {
|
| 289 |
+
// we make two assumptions here which are fine to make for rects returned from getBoundingClientRect
|
| 290 |
+
// 1. rects are upright and not rotated
|
| 291 |
+
// 2. If two rects intersect, we assume distance to be 0
|
| 292 |
+
let dx = 0;
|
| 293 |
+
let dy = 0;
|
| 294 |
+
const rect1 = a.rect;
|
| 295 |
+
const rect2 = b.rect;
|
| 296 |
+
// Calculate the horizontal distance
|
| 297 |
+
if (rect1.x + rect1.width < rect2.x) {
|
| 298 |
+
dx = rect2.x - (rect1.x + rect1.width);
|
| 299 |
+
} else if (rect2.x + rect2.width < rect1.x) {
|
| 300 |
+
dx = rect1.x - (rect2.x + rect2.width);
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
// Calculate the vertical distance
|
| 304 |
+
if (rect1.y + rect1.height < rect2.y) {
|
| 305 |
+
dy = rect2.y - (rect1.y + rect1.height);
|
| 306 |
+
} else if (rect2.y + rect2.height < rect1.y) {
|
| 307 |
+
dy = rect1.y - (rect2.y + rect2.height);
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
const distance = Math.sqrt(dx * dx + dy * dy);
|
| 311 |
+
// Return the Euclidean distance
|
| 312 |
+
return distance;
|
| 313 |
+
};
|
| 314 |
+
/**
|
| 315 |
+
* Clusters nodes using dbscan
|
| 316 |
+
*/
|
| 317 |
+
const clusterReadableNodes = (nodes: NodeWithRect[]) => {
|
| 318 |
+
const { clusters } = DBSCAN({
|
| 319 |
+
dataset: nodes,
|
| 320 |
+
epsilon: 28,
|
| 321 |
+
minimumPoints: 1,
|
| 322 |
+
distanceFunction,
|
| 323 |
+
});
|
| 324 |
+
|
| 325 |
+
return clusters;
|
| 326 |
+
};
|
| 327 |
+
|
| 328 |
+
const totalTextLength = (cluster: number[]) => {
|
| 329 |
+
return cluster
|
| 330 |
+
.map((t) => readableNodes[t].node.innerText?.replaceAll(/ {2}|\r\n|\n|\r/gm, ""))
|
| 331 |
+
.join("").length;
|
| 332 |
+
};
|
| 333 |
+
|
| 334 |
+
const approximatelyEqual = (a: number, b: number, epsilon = 1) => {
|
| 335 |
+
return Math.abs(a - b) < epsilon;
|
| 336 |
+
};
|
| 337 |
+
|
| 338 |
+
const getClusterBounds = (cluster: number[]) => {
|
| 339 |
+
const leftMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.x));
|
| 340 |
+
const topMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.y));
|
| 341 |
+
const rightMostPoint = Math.max(
|
| 342 |
+
...cluster.map((c) => readableNodes[c].rect.x + readableNodes[c].rect.width)
|
| 343 |
+
);
|
| 344 |
+
const bottomMostPoint = Math.max(
|
| 345 |
+
...cluster.map((c) => readableNodes[c].rect.y + readableNodes[c].rect.height)
|
| 346 |
+
);
|
| 347 |
+
return {
|
| 348 |
+
// left most element
|
| 349 |
+
x: leftMostPoint,
|
| 350 |
+
y: topMostPoint,
|
| 351 |
+
width: rightMostPoint - leftMostPoint,
|
| 352 |
+
height: bottomMostPoint - topMostPoint,
|
| 353 |
+
};
|
| 354 |
+
};
|
| 355 |
+
|
| 356 |
+
const round = (num: number, decimalPlaces = 2) => {
|
| 357 |
+
const factor = Math.pow(10, decimalPlaces);
|
| 358 |
+
return Math.round(num * factor) / factor;
|
| 359 |
+
};
|
| 360 |
+
|
| 361 |
+
/** minimum distance to center of the screen */
|
| 362 |
+
const clusterCentrality = (cluster: number[]) => {
|
| 363 |
+
const bounds = getClusterBounds(cluster);
|
| 364 |
+
const centerOfScreen = window.innerWidth / 2;
|
| 365 |
+
// the cluster contains the center of the screen
|
| 366 |
+
if (bounds.x < centerOfScreen && bounds.x + bounds.width > centerOfScreen) {
|
| 367 |
+
return 0;
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
// the cluster is to the left of the screen
|
| 371 |
+
if (bounds.x + bounds.width < centerOfScreen) {
|
| 372 |
+
return centerOfScreen - (bounds.x + bounds.width);
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
// the cluster is to the right of the screen
|
| 376 |
+
return bounds.x - centerOfScreen;
|
| 377 |
+
};
|
| 378 |
+
/** Percentage (0-100, rounded to 2 decimals) of `totalLength` contributed by the cluster's text */
const percentageTextShare = (cluster: number[], totalLength: number) => {
	// NOTE(review): the centrality penalty mentioned in an earlier comment is applied
	// by the caller (findCriticalClusters), not here — this is a plain ratio.

	return round((totalTextLength(cluster) / totalLength) * 100);
};
|
| 384 |
+
|
| 385 |
+
const shouldMergeClusters = (clusterA: number[], clusterB: number[]) => {
|
| 386 |
+
const clusterABounds = getClusterBounds(clusterA);
|
| 387 |
+
const clusterBBounds = getClusterBounds(clusterB);
|
| 388 |
+
|
| 389 |
+
// A cluster is horizontally aligned if the x and width are roughly equal
|
| 390 |
+
const isHorizontallyAligned =
|
| 391 |
+
approximatelyEqual(clusterABounds.x, clusterBBounds.x, 40) &&
|
| 392 |
+
approximatelyEqual(clusterABounds.width, clusterBBounds.width, 40);
|
| 393 |
+
|
| 394 |
+
if (!isHorizontallyAligned) return false;
|
| 395 |
+
|
| 396 |
+
// check the y gap between the clusters
|
| 397 |
+
const higherCluster = clusterABounds.y < clusterBBounds.y ? clusterABounds : clusterBBounds;
|
| 398 |
+
const lowerCluster = clusterABounds.y < clusterBBounds.y ? clusterBBounds : clusterABounds;
|
| 399 |
+
const yGap = lowerCluster.y - (higherCluster.y + higherCluster.height);
|
| 400 |
+
|
| 401 |
+
if (approximatelyEqual(yGap, 0, 100)) return true;
|
| 402 |
+
};
|
| 403 |
+
|
| 404 |
+
const findCriticalClusters = (clusters: number[][]) => {
|
| 405 |
+
// merge the clusters that have similar widths and x position
|
| 406 |
+
|
| 407 |
+
let i = 0;
|
| 408 |
+
while (i < clusters.length) {
|
| 409 |
+
const cluster = clusters[i];
|
| 410 |
+
for (let j = i + 1; j < clusters.length; j++) {
|
| 411 |
+
const otherCluster = clusters[j];
|
| 412 |
+
if (shouldMergeClusters(cluster, otherCluster)) {
|
| 413 |
+
cluster.push(...otherCluster);
|
| 414 |
+
clusters.splice(j, 1);
|
| 415 |
+
j -= 1;
|
| 416 |
+
}
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
i++;
|
| 420 |
+
}
|
| 421 |
+
|
| 422 |
+
const totalText = totalTextLength(clusters.flat());
|
| 423 |
+
|
| 424 |
+
// sort in descending order of text share
|
| 425 |
+
const clusterWithMetrics = clusters.map((cluster) => {
|
| 426 |
+
const centrality = clusterCentrality(cluster);
|
| 427 |
+
return {
|
| 428 |
+
cluster,
|
| 429 |
+
centrality,
|
| 430 |
+
percentageTextShare: percentageTextShare(cluster, totalText),
|
| 431 |
+
};
|
| 432 |
+
});
|
| 433 |
+
|
| 434 |
+
// if there is a dominant cluster with more than 60% text share, return that
|
| 435 |
+
const dominantCluster = clusterWithMetrics[0].percentageTextShare > 60;
|
| 436 |
+
if (dominantCluster) return [clusterWithMetrics[0].cluster];
|
| 437 |
+
|
| 438 |
+
// clusters are sorted by text share after applying a penalty for centrality
|
| 439 |
+
const sortedClusters = clusterWithMetrics.sort((a, b) => {
|
| 440 |
+
const penaltyForA = Math.pow(0.9, a.centrality / 100);
|
| 441 |
+
const penaltyForB = Math.pow(0.9, b.centrality / 100);
|
| 442 |
+
const adjustedTextShareA = a.percentageTextShare * penaltyForA;
|
| 443 |
+
const adjustedTextShareB = b.percentageTextShare * penaltyForB;
|
| 444 |
+
|
| 445 |
+
return adjustedTextShareB - adjustedTextShareA;
|
| 446 |
+
});
|
| 447 |
+
|
| 448 |
+
// find all clusters that are similar to the largest cluster in terms of text share
|
| 449 |
+
// and see if they are enough to cover at least 60% of the text share
|
| 450 |
+
const largeTextShareClusters = sortedClusters.filter((c) =>
|
| 451 |
+
approximatelyEqual(c.percentageTextShare, sortedClusters[0].percentageTextShare, 10)
|
| 452 |
+
);
|
| 453 |
+
|
| 454 |
+
const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
|
| 455 |
+
(acc, cluster) => acc + cluster.percentageTextShare,
|
| 456 |
+
0
|
| 457 |
+
);
|
| 458 |
+
|
| 459 |
+
if (totalTextShareOfLargeClusters > 60) {
|
| 460 |
+
return largeTextShareClusters.map((c) => c.cluster);
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
// choose clusters till the text share is greater than 60%
|
| 464 |
+
let totalTextShare = 0;
|
| 465 |
+
const criticalClusters = [];
|
| 466 |
+
for (const cluster of sortedClusters) {
|
| 467 |
+
/** Ignore clusters with less than 2%*/
|
| 468 |
+
if (cluster.percentageTextShare < 2) continue;
|
| 469 |
+
if (totalTextShare > 60) break;
|
| 470 |
+
criticalClusters.push(cluster.cluster);
|
| 471 |
+
totalTextShare += cluster.percentageTextShare;
|
| 472 |
+
}
|
| 473 |
+
|
| 474 |
+
// if the total text share is less than 60% then return an empty array
|
| 475 |
+
// as this website should not be particularly useful for the web search anyways
|
| 476 |
+
// this should almost never happen on structured website with a lot of text
|
| 477 |
+
if (totalTextShare < 60) {
|
| 478 |
+
return [];
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
return criticalClusters;
|
| 482 |
+
};
|
| 483 |
+
|
| 484 |
+
const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
|
| 485 |
+
function serializeHTMLElement(node: Element): SerializedHTMLElement {
|
| 486 |
+
return {
|
| 487 |
+
tagName: node.tagName.toLowerCase(),
|
| 488 |
+
attributes: allowListedAttributes.reduce((acc, attr) => {
|
| 489 |
+
const value = node.getAttribute(attr);
|
| 490 |
+
if (value) {
|
| 491 |
+
acc[attr] = value;
|
| 492 |
+
}
|
| 493 |
+
return acc;
|
| 494 |
+
}, {} as Record<string, string>),
|
| 495 |
+
content: Array.from(node.childNodes).map(serializeNode).filter(Boolean),
|
| 496 |
+
};
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
function serializeNode(node: Node): SerializedHTMLElement | string {
|
| 500 |
+
if (node.nodeType === 1) return serializeHTMLElement(node as Element);
|
| 501 |
+
else if (node.nodeType === 3) return node.textContent ?? "";
|
| 502 |
+
else return "";
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
function getPageMetadata(): {
|
| 506 |
+
title: string;
|
| 507 |
+
siteName?: string;
|
| 508 |
+
author?: string;
|
| 509 |
+
description?: string;
|
| 510 |
+
createdAt?: string;
|
| 511 |
+
updatedAt?: string;
|
| 512 |
+
} {
|
| 513 |
+
const title = document.title ?? "";
|
| 514 |
+
const siteName =
|
| 515 |
+
document.querySelector("meta[property='og:site_name']")?.getAttribute("content") ?? undefined;
|
| 516 |
+
const author =
|
| 517 |
+
document.querySelector("meta[name='author']")?.getAttribute("content") ?? undefined;
|
| 518 |
+
const description =
|
| 519 |
+
document.querySelector("meta[name='description']")?.getAttribute("content") ??
|
| 520 |
+
document.querySelector("meta[property='og:description']")?.getAttribute("content") ??
|
| 521 |
+
undefined;
|
| 522 |
+
const createdAt =
|
| 523 |
+
document.querySelector("meta[property='article:published_time']")?.getAttribute("content") ??
|
| 524 |
+
document.querySelector("meta[name='date']")?.getAttribute("content") ??
|
| 525 |
+
undefined;
|
| 526 |
+
const updatedAt =
|
| 527 |
+
document.querySelector("meta[property='article:modified_time']")?.getAttribute("content") ??
|
| 528 |
+
undefined;
|
| 529 |
+
|
| 530 |
+
return { title, siteName, author, description, createdAt, updatedAt };
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
const readableNodes = getAllReadableNodes();
|
| 534 |
+
const clusters = clusterReadableNodes(readableNodes);
|
| 535 |
+
|
| 536 |
+
const criticalClusters = findCriticalClusters(clusters);
|
| 537 |
+
|
| 538 |
+
// filter readable nodes using the above information as well as heuristics
|
| 539 |
+
const filteredNodes = readableNodes.filter((_, idx) => {
|
| 540 |
+
return criticalClusters.some((cluster) => {
|
| 541 |
+
return cluster.includes(idx);
|
| 542 |
+
});
|
| 543 |
+
});
|
| 544 |
+
|
| 545 |
+
const elements = filteredNodes
|
| 546 |
+
.filter(
|
| 547 |
+
(node, idx, nodes) => !nodes.slice(idx + 1).some((otherNode) => node.node === otherNode.node)
|
| 548 |
+
)
|
| 549 |
+
.map<SerializedHTMLElement>(({ node }) => serializeHTMLElement(node));
|
| 550 |
+
const metadata = getPageMetadata();
|
| 551 |
+
return { ...metadata, elements };
|
| 552 |
+
}
|
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import {
|
| 2 |
+
type BrowserContext,
|
| 3 |
+
chromium,
|
| 4 |
+
devices,
|
| 5 |
+
type Page,
|
| 6 |
+
type BrowserContextOptions,
|
| 7 |
+
} from "playwright";
|
| 8 |
+
import { PlaywrightBlocker } from "@cliqz/adblocker-playwright";
|
| 9 |
+
import { env } from "$env/dynamic/private";
|
| 10 |
+
|
| 11 |
+
// Singleton initialized by initPlaywrightService
|
| 12 |
+
let playwrightService: Promise<{ ctx: BrowserContext; blocker: PlaywrightBlocker }>;
|
| 13 |
+
|
| 14 |
+
/**
 * Launches a headless Chromium browser and builds the shared browser context
 * plus an ad/tracker blocker. Returns the frozen { ctx, blocker } pair.
 * The result is cached in the module-level `playwrightService` by loadPage,
 * so this effectively runs once per process.
 */
async function initPlaywrightService() {
	// Already initialized by an earlier loadPage call — reuse the singleton.
	if (playwrightService) return playwrightService;

	const browser = await chromium.launch({ headless: true });

	// Shut the browser process down when the server is interrupted.
	process.on("SIGINT", () => browser.close());

	const device = devices["Desktop Chrome"];
	const options: BrowserContextOptions = {
		...device,
		// Increasing width improves spatial clustering accuracy
		screen: {
			width: 3840,
			height: 1080,
		},
		viewport: {
			width: 3840,
			height: 1080,
		},
		reducedMotion: "reduce",
		acceptDownloads: false,
		timezoneId: "America/New_York",
		locale: "en-US",
	};
	const ctx = await browser.newContext(options);
	// Block heavyweight resources (fonts/media/frames/images); additionally block
	// scripts when JavaScript is disabled via the WEBSEARCH_JAVASCRIPT env flag.
	const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then((blker) => {
		const mostBlocked = blker.blockFonts().blockMedias().blockFrames().blockImages();
		if (env.WEBSEARCH_JAVASCRIPT === "false") return mostBlocked.blockScripts();
		return mostBlocked;
	});
	return Object.freeze({ ctx, blocker });
}
|
| 46 |
+
|
| 47 |
+
/**
 * Opens a new page in the shared browser context with ad blocking enabled and
 * navigates to `url`. Navigation is capped at 2 seconds; on timeout or load
 * failure a warning is logged and the (possibly partially loaded) page is
 * still returned so parsing can be attempted.
 */
export async function loadPage(url: string): Promise<Page> {
	// Lazily initialize the shared browser context on first use.
	if (!playwrightService) playwrightService = initPlaywrightService();
	const { ctx, blocker } = await playwrightService;

	const page = await ctx.newPage();
	await blocker.enableBlockingInPage(page);

	await page.goto(url, { waitUntil: "load", timeout: 2000 }).catch(() => {
		console.warn(`Failed to load page within 2s: ${url}`);
	});

	return page;
}
|
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { AppendUpdate } from "../runWebSearch";
|
| 2 |
+
import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
|
| 3 |
+
import { loadPage } from "./playwright";
|
| 4 |
+
|
| 5 |
+
import { spatialParser } from "./parser";
|
| 6 |
+
import { htmlToMarkdownTree } from "../markdown/tree";
|
| 7 |
+
import { timeout } from "$lib/utils/timeout";
|
| 8 |
+
|
| 9 |
+
/**
 * Curried scraper: given a status-update callback and a per-element character
 * cap, returns a function that scrapes a single search result. Failures are
 * reported through `appendUpdate` and yield undefined instead of throwing, so
 * one bad page does not abort the whole search.
 */
export const scrape =
	(appendUpdate: AppendUpdate, maxCharsPerElem: number) =>
	async (source: WebSearchSource): Promise<WebSearchScrapedSource | undefined> => {
		try {
			const page = await scrapeUrl(source.link, maxCharsPerElem);
			appendUpdate("Browsing webpage", [source.link]);
			return { ...source, page };
		} catch (e) {
			const message = e instanceof Error ? e.message : String(e);
			appendUpdate("Failed to parse webpage", [message, source.link], "error");
		}
	};
|
| 21 |
+
|
| 22 |
+
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
|
| 23 |
+
const page = await loadPage(url);
|
| 24 |
+
|
| 25 |
+
return timeout(page.evaluate(spatialParser), 2000)
|
| 26 |
+
.then(({ elements, ...parsed }) => ({
|
| 27 |
+
...parsed,
|
| 28 |
+
markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
|
| 29 |
+
}))
|
| 30 |
+
.catch((cause) => {
|
| 31 |
+
throw Error("Parsing failed", { cause });
|
| 32 |
+
})
|
| 33 |
+
.finally(() => page.close());
|
| 34 |
+
}
|
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Plain-object snapshot of a DOM element, safe to transfer out of the
 * browser context (e.g. as the result of page.evaluate).
 */
export interface SerializedHTMLElement {
	/** Lower-cased tag name, e.g. "a" or "div". */
	tagName: string;
	/** Attribute name → value for the attributes that were present on the element. */
	attributes: Record<string, string>;
	/** Child nodes in document order: nested elements or raw text. */
	content: (SerializedHTMLElement | string)[];
}
|
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { WebSearchProvider, type WebSearchSource } from "$lib/types/WebSearch";
|
| 2 |
+
import { env } from "$env/dynamic/private";
|
| 3 |
+
import searchSerper from "./endpoints/serper";
|
| 4 |
+
import searchSerpApi from "./endpoints/serpApi";
|
| 5 |
+
import searchSerpStack from "./endpoints/serpStack";
|
| 6 |
+
import searchYouApi from "./endpoints/youApi";
|
| 7 |
+
import searchWebLocal from "./endpoints/webLocal";
|
| 8 |
+
import searchSearxng from "./endpoints/searxng";
|
| 9 |
+
|
| 10 |
+
export function getWebSearchProvider() {
|
| 11 |
+
if (env.YDC_API_KEY) return WebSearchProvider.YOU;
|
| 12 |
+
if (env.SEARXNG_QUERY_URL) return WebSearchProvider.SEARXNG;
|
| 13 |
+
return WebSearchProvider.GOOGLE;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
/** Searches the web using the first available provider, based on the env */
|
| 17 |
+
export async function searchWeb(query: string): Promise<WebSearchSource[]> {
|
| 18 |
+
if (env.USE_LOCAL_WEBSEARCH) return searchWebLocal(query);
|
| 19 |
+
if (env.SEARXNG_QUERY_URL) return searchSearxng(query);
|
| 20 |
+
if (env.SERPER_API_KEY) return searchSerper(query);
|
| 21 |
+
if (env.YDC_API_KEY) return searchYouApi(query);
|
| 22 |
+
if (env.SERPAPI_KEY) return searchSerpApi(query);
|
| 23 |
+
if (env.SERPSTACK_API_KEY) return searchSerpStack(query);
|
| 24 |
+
throw new Error(
|
| 25 |
+
"No configuration found for web search. Please set USE_LOCAL_WEBSEARCH, SEARXNG_QUERY_URL, SERPER_API_KEY, YDC_API_KEY, or SERPSTACK_API_KEY in your environment variables."
|
| 26 |
+
);
|
| 27 |
+
}
|
|
@@ -1,7 +1,9 @@
|
|
| 1 |
import { env } from "$env/dynamic/private";
|
| 2 |
import { logger } from "$lib/server/logger";
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
export async function searchSearxng(query: string) {
|
| 5 |
const abortController = new AbortController();
|
| 6 |
setTimeout(() => abortController.abort(), 10000);
|
| 7 |
|
|
@@ -20,7 +22,7 @@ export async function searchSearxng(query: string) {
|
|
| 20 |
.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
|
| 21 |
.catch((error) => {
|
| 22 |
logger.error("Failed to fetch or parse JSON", error);
|
| 23 |
-
throw new Error("Failed to fetch or parse JSON");
|
| 24 |
});
|
| 25 |
|
| 26 |
// Extract 'url' elements from the JSON response and trim to the top 5 URLs
|
|
@@ -31,5 +33,5 @@ export async function searchSearxng(query: string) {
|
|
| 31 |
}
|
| 32 |
|
| 33 |
// Map URLs to the correct object shape
|
| 34 |
-
return
|
| 35 |
}
|
|
|
|
| 1 |
import { env } from "$env/dynamic/private";
|
| 2 |
import { logger } from "$lib/server/logger";
|
| 3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 4 |
+
import { isURL } from "$lib/utils/isUrl";
|
| 5 |
|
| 6 |
+
export default async function searchSearxng(query: string): Promise<WebSearchSource[]> {
|
| 7 |
const abortController = new AbortController();
|
| 8 |
setTimeout(() => abortController.abort(), 10000);
|
| 9 |
|
|
|
|
| 22 |
.then((response) => response.json() as Promise<{ results: { url: string }[] }>)
|
| 23 |
.catch((error) => {
|
| 24 |
logger.error("Failed to fetch or parse JSON", error);
|
| 25 |
+
throw new Error("Failed to fetch or parse JSON", { cause: error });
|
| 26 |
});
|
| 27 |
|
| 28 |
// Extract 'url' elements from the JSON response and trim to the top 5 URLs
|
|
|
|
| 33 |
}
|
| 34 |
|
| 35 |
// Map URLs to the correct object shape
|
| 36 |
+
return urls.filter(isURL).map((link) => ({ link }));
|
| 37 |
}
|
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { env } from "$env/dynamic/private";
|
| 2 |
+
import { getJson, type GoogleParameters } from "serpapi";
|
| 3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 4 |
+
import { isURL } from "$lib/utils/isUrl";
|
| 5 |
+
|
| 6 |
+
type SerpApiResponse = {
|
| 7 |
+
organic_results: {
|
| 8 |
+
link: string;
|
| 9 |
+
}[];
|
| 10 |
+
};
|
| 11 |
+
|
| 12 |
+
export default async function searchWebSerpApi(query: string): Promise<WebSearchSource[]> {
|
| 13 |
+
const params = {
|
| 14 |
+
q: query,
|
| 15 |
+
hl: "en",
|
| 16 |
+
gl: "us",
|
| 17 |
+
google_domain: "google.com",
|
| 18 |
+
api_key: env.SERPAPI_KEY,
|
| 19 |
+
} satisfies GoogleParameters;
|
| 20 |
+
|
| 21 |
+
// Show result as JSON
|
| 22 |
+
const response = (await getJson("google", params)) as unknown as SerpApiResponse;
|
| 23 |
+
|
| 24 |
+
return response.organic_results.filter(({ link }) => isURL(link));
|
| 25 |
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { env } from "$env/dynamic/private";
|
| 2 |
+
import { isURL } from "$lib/utils/isUrl";
|
| 3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 4 |
+
|
| 5 |
+
type SerpStackResponse = {
|
| 6 |
+
organic_results: {
|
| 7 |
+
title: string;
|
| 8 |
+
url: string;
|
| 9 |
+
snippet?: string;
|
| 10 |
+
}[];
|
| 11 |
+
error?: string;
|
| 12 |
+
};
|
| 13 |
+
|
| 14 |
+
export default async function searchSerpStack(query: string): Promise<WebSearchSource[]> {
|
| 15 |
+
const response = await fetch(
|
| 16 |
+
`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
|
| 17 |
+
{ headers: { "Content-type": "application/json; charset=UTF-8" } }
|
| 18 |
+
);
|
| 19 |
+
|
| 20 |
+
const data = (await response.json()) as SerpStackResponse;
|
| 21 |
+
|
| 22 |
+
if (!response.ok) {
|
| 23 |
+
throw new Error(
|
| 24 |
+
data.error ?? `SerpStack API returned error code ${response.status} - ${response.statusText}`
|
| 25 |
+
);
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
return data.organic_results
|
| 29 |
+
.filter(({ url }) => isURL(url))
|
| 30 |
+
.map(({ title, url, snippet }) => ({
|
| 31 |
+
title,
|
| 32 |
+
link: url,
|
| 33 |
+
text: snippet ?? "",
|
| 34 |
+
}));
|
| 35 |
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { env } from "$env/dynamic/private";
|
| 2 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 3 |
+
|
| 4 |
+
export default async function search(query: string): Promise<WebSearchSource[]> {
|
| 5 |
+
const params = {
|
| 6 |
+
q: query,
|
| 7 |
+
hl: "en",
|
| 8 |
+
gl: "us",
|
| 9 |
+
};
|
| 10 |
+
|
| 11 |
+
const response = await fetch("https://google.serper.dev/search", {
|
| 12 |
+
method: "POST",
|
| 13 |
+
body: JSON.stringify(params),
|
| 14 |
+
headers: {
|
| 15 |
+
"x-api-key": env.SERPER_API_KEY,
|
| 16 |
+
"Content-type": "application/json",
|
| 17 |
+
},
|
| 18 |
+
});
|
| 19 |
+
|
| 20 |
+
/* eslint-disable @typescript-eslint/no-explicit-any */
|
| 21 |
+
const data = (await response.json()) as Record<string, any>;
|
| 22 |
+
|
| 23 |
+
if (!response.ok) {
|
| 24 |
+
throw new Error(
|
| 25 |
+
data["message"] ??
|
| 26 |
+
`Serper API returned error code ${response.status} - ${response.statusText}`
|
| 27 |
+
);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
return data["organic"] ?? [];
|
| 31 |
+
}
|
|
@@ -1,45 +1,35 @@
|
|
| 1 |
import { JSDOM, VirtualConsole } from "jsdom";
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
export async function searchWebLocal(query: string) {
|
| 4 |
const abortController = new AbortController();
|
| 5 |
setTimeout(() => abortController.abort(), 10000);
|
| 6 |
|
| 7 |
-
const htmlString = await fetch(
|
| 8 |
-
|
| 9 |
-
|
|
|
|
| 10 |
.then((response) => response.text())
|
| 11 |
.catch();
|
| 12 |
|
| 13 |
const virtualConsole = new VirtualConsole();
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
// No-op to skip console errors.
|
| 17 |
-
});
|
| 18 |
-
|
| 19 |
-
// put the html string into a DOM
|
| 20 |
-
const dom = new JSDOM(htmlString ?? "", {
|
| 21 |
-
virtualConsole,
|
| 22 |
-
});
|
| 23 |
-
|
| 24 |
-
const { document } = dom.window;
|
| 25 |
-
// get all a documents with href tag
|
| 26 |
-
|
| 27 |
const links = document.querySelectorAll("a");
|
| 28 |
-
|
| 29 |
-
if (!links.length) {
|
| 30 |
-
throw new Error(`webpage doesn't have any "a" element`);
|
| 31 |
-
}
|
| 32 |
|
| 33 |
// take url that start wirth /url?q=
|
| 34 |
// and do not contain google.com links
|
| 35 |
// and strip them up to '&sa='
|
| 36 |
const linksHref = Array.from(links)
|
| 37 |
-
.
|
| 38 |
-
.
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
});
|
| 42 |
|
| 43 |
// remove duplicate links and map links to the correct object shape
|
| 44 |
-
return
|
| 45 |
}
|
|
|
|
| 1 |
import { JSDOM, VirtualConsole } from "jsdom";
|
| 2 |
+
import { isURL } from "$lib/utils/isUrl";
|
| 3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 4 |
|
| 5 |
+
export default async function searchWebLocal(query: string): Promise<WebSearchSource[]> {
|
| 6 |
const abortController = new AbortController();
|
| 7 |
setTimeout(() => abortController.abort(), 10000);
|
| 8 |
|
| 9 |
+
const htmlString = await fetch(
|
| 10 |
+
"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
|
| 11 |
+
{ signal: abortController.signal }
|
| 12 |
+
)
|
| 13 |
.then((response) => response.text())
|
| 14 |
.catch();
|
| 15 |
|
| 16 |
const virtualConsole = new VirtualConsole();
|
| 17 |
+
virtualConsole.on("error", () => {}); // No-op to skip console errors.
|
| 18 |
+
const document = new JSDOM(htmlString ?? "", { virtualConsole }).window.document;
|
| 19 |
|
| 20 |
+
// get all links
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
const links = document.querySelectorAll("a");
|
| 22 |
+
if (!links.length) throw new Error(`webpage doesn't have any "a" element`);
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
// take url that start wirth /url?q=
|
| 25 |
// and do not contain google.com links
|
| 26 |
// and strip them up to '&sa='
|
| 27 |
const linksHref = Array.from(links)
|
| 28 |
+
.map((el) => el.href)
|
| 29 |
+
.filter((link) => link.startsWith("/url?q=") && !link.includes("google.com/"))
|
| 30 |
+
.map((link) => link.slice("/url?q=".length, link.indexOf("&sa=")))
|
| 31 |
+
.filter(isURL);
|
|
|
|
| 32 |
|
| 33 |
// remove duplicate links and map links to the correct object shape
|
| 34 |
+
return [...new Set(linksHref)].map((link) => ({ link }));
|
| 35 |
}
|
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { env } from "$env/dynamic/private";
|
| 2 |
+
import { isURL } from "$lib/utils/isUrl";
|
| 3 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 4 |
+
|
| 5 |
+
interface YouWebSearch {
|
| 6 |
+
hits: YouSearchHit[];
|
| 7 |
+
latency: number;
|
| 8 |
+
}
|
| 9 |
+
|
| 10 |
+
interface YouSearchHit {
|
| 11 |
+
url: string;
|
| 12 |
+
title: string;
|
| 13 |
+
description: string;
|
| 14 |
+
snippets: string[];
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
export default async function searchWebYouApi(query: string): Promise<WebSearchSource[]> {
|
| 18 |
+
const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
|
| 19 |
+
method: "GET",
|
| 20 |
+
headers: {
|
| 21 |
+
"X-API-Key": env.YDC_API_KEY,
|
| 22 |
+
"Content-type": "application/json; charset=UTF-8",
|
| 23 |
+
},
|
| 24 |
+
});
|
| 25 |
+
|
| 26 |
+
if (!response.ok) {
|
| 27 |
+
throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
const data = (await response.json()) as YouWebSearch;
|
| 31 |
+
const formattedResultsWithSnippets = data.hits
|
| 32 |
+
.filter(({ url }) => isURL(url))
|
| 33 |
+
.map(({ title, url, snippets }) => ({
|
| 34 |
+
title,
|
| 35 |
+
link: url,
|
| 36 |
+
text: snippets?.join("\n") || "",
|
| 37 |
+
}))
|
| 38 |
+
.sort((a, b) => b.text.length - a.text.length); // desc order by text length
|
| 39 |
+
|
| 40 |
+
return formattedResultsWithSnippets;
|
| 41 |
+
}
|
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import type { Message } from "$lib/types/Message";
|
| 2 |
import { format } from "date-fns";
|
| 3 |
-
import { generateFromDefaultEndpoint } from "
|
| 4 |
|
| 5 |
export async function generateQuery(messages: Message[]) {
|
| 6 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
|
|
|
| 1 |
import type { Message } from "$lib/types/Message";
|
| 2 |
import { format } from "date-fns";
|
| 3 |
+
import { generateFromDefaultEndpoint } from "../../generateFromDefaultEndpoint";
|
| 4 |
|
| 5 |
export async function generateQuery(messages: Message[]) {
|
| 6 |
const currentDate = format(new Date(), "MMMM d, yyyy");
|
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { WebSearchSource } from "$lib/types/WebSearch";
|
| 2 |
+
import type { Message } from "$lib/types/Message";
|
| 3 |
+
import type { Assistant } from "$lib/types/Assistant";
|
| 4 |
+
import type { AppendUpdate } from "../runWebSearch";
|
| 5 |
+
import { getWebSearchProvider, searchWeb } from "./endpoints";
|
| 6 |
+
import { generateQuery } from "./generateQuery";
|
| 7 |
+
import { isURLStringLocal } from "$lib/server/isURLLocal";
|
| 8 |
+
import { isURL } from "$lib/utils/isUrl";
|
| 9 |
+
|
| 10 |
+
import z from "zod";
|
| 11 |
+
import JSON5 from "json5";
|
| 12 |
+
import { env } from "$env/dynamic/private";
|
| 13 |
+
|
| 14 |
+
const listSchema = z.array(z.string()).default([]);
|
| 15 |
+
const allowList = listSchema.parse(JSON5.parse(env.WEBSEARCH_ALLOWLIST));
|
| 16 |
+
const blockList = listSchema.parse(JSON5.parse(env.WEBSEARCH_BLOCKLIST));
|
| 17 |
+
|
| 18 |
+
export async function search(
|
| 19 |
+
messages: Message[],
|
| 20 |
+
ragSettings: Assistant["rag"] | undefined,
|
| 21 |
+
appendUpdate: AppendUpdate
|
| 22 |
+
): Promise<{ searchQuery: string; pages: WebSearchSource[] }> {
|
| 23 |
+
if (ragSettings && ragSettings?.allowedLinks.length > 0) {
|
| 24 |
+
appendUpdate("Using links specified in Assistant");
|
| 25 |
+
return {
|
| 26 |
+
searchQuery: "",
|
| 27 |
+
pages: await directLinksToSource(ragSettings.allowedLinks).then(filterByBlockList),
|
| 28 |
+
};
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
const searchQuery = await generateQuery(messages);
|
| 32 |
+
appendUpdate(`Searching ${getWebSearchProvider()}`, [searchQuery]);
|
| 33 |
+
|
| 34 |
+
// handle the global and (optional) rag lists
|
| 35 |
+
if (ragSettings && ragSettings?.allowedDomains.length > 0) {
|
| 36 |
+
appendUpdate("Filtering on specified domains");
|
| 37 |
+
}
|
| 38 |
+
const filters = buildQueryFromSiteFilters(
|
| 39 |
+
[...(ragSettings?.allowedDomains ?? []), ...allowList],
|
| 40 |
+
blockList
|
| 41 |
+
);
|
| 42 |
+
|
| 43 |
+
const searchQueryWithFilters = `${filters} ${searchQuery}`;
|
| 44 |
+
const searchResults = await searchWeb(searchQueryWithFilters).then(filterByBlockList);
|
| 45 |
+
|
| 46 |
+
return {
|
| 47 |
+
searchQuery: searchQueryWithFilters,
|
| 48 |
+
pages: searchResults,
|
| 49 |
+
};
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
// ----------
|
| 53 |
+
// Utils
|
| 54 |
+
function filterByBlockList(results: WebSearchSource[]): WebSearchSource[] {
|
| 55 |
+
return results.filter((result) => !blockList.some((blocked) => result.link.includes(blocked)));
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
function buildQueryFromSiteFilters(allow: string[], block: string[]) {
|
| 59 |
+
return (
|
| 60 |
+
allow.map((item) => "site:" + item).join(" OR ") +
|
| 61 |
+
" " +
|
| 62 |
+
block.map((item) => "-site:" + item).join(" ")
|
| 63 |
+
);
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
async function directLinksToSource(links: string[]): Promise<WebSearchSource[]> {
|
| 67 |
+
if (env.ENABLE_LOCAL_FETCH !== "true") {
|
| 68 |
+
const localLinks = await Promise.all(links.map(isURLStringLocal));
|
| 69 |
+
links = links.filter((_, index) => !localLinks[index]);
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
return links.filter(isURL).map((link) => ({
|
| 73 |
+
link,
|
| 74 |
+
title: "",
|
| 75 |
+
text: [""],
|
| 76 |
+
}));
|
| 77 |
+
}
|
|
@@ -1,148 +0,0 @@
|
|
| 1 |
-
import type { YouWebSearch } from "../../types/WebSearch";
|
| 2 |
-
import { WebSearchProvider } from "../../types/WebSearch";
|
| 3 |
-
import { env } from "$env/dynamic/private";
|
| 4 |
-
import { getJson } from "serpapi";
|
| 5 |
-
import type { GoogleParameters } from "serpapi";
|
| 6 |
-
import { searchWebLocal } from "./searchWebLocal";
|
| 7 |
-
import { searchSearxng } from "./searchSearxng";
|
| 8 |
-
|
| 9 |
-
// get which SERP api is providing web results
|
| 10 |
-
export function getWebSearchProvider() {
|
| 11 |
-
if (env.YDC_API_KEY) {
|
| 12 |
-
return WebSearchProvider.YOU;
|
| 13 |
-
} else if (env.SEARXNG_QUERY_URL) {
|
| 14 |
-
return WebSearchProvider.SEARXNG;
|
| 15 |
-
} else {
|
| 16 |
-
return WebSearchProvider.GOOGLE;
|
| 17 |
-
}
|
| 18 |
-
}
|
| 19 |
-
|
| 20 |
-
// Show result as JSON
|
| 21 |
-
export async function searchWeb(query: string) {
|
| 22 |
-
if (env.USE_LOCAL_WEBSEARCH) {
|
| 23 |
-
return await searchWebLocal(query);
|
| 24 |
-
}
|
| 25 |
-
if (env.SEARXNG_QUERY_URL) {
|
| 26 |
-
return await searchSearxng(query);
|
| 27 |
-
}
|
| 28 |
-
if (env.SERPER_API_KEY) {
|
| 29 |
-
return await searchWebSerper(query);
|
| 30 |
-
}
|
| 31 |
-
if (env.YDC_API_KEY) {
|
| 32 |
-
return await searchWebYouApi(query);
|
| 33 |
-
}
|
| 34 |
-
if (env.SERPAPI_KEY) {
|
| 35 |
-
return await searchWebSerpApi(query);
|
| 36 |
-
}
|
| 37 |
-
if (env.SERPSTACK_API_KEY) {
|
| 38 |
-
return await searchSerpStack(query);
|
| 39 |
-
}
|
| 40 |
-
throw new Error("No You.com or Serper.dev or SerpAPI key found");
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
export async function searchWebSerper(query: string) {
|
| 44 |
-
const params = {
|
| 45 |
-
q: query,
|
| 46 |
-
hl: "en",
|
| 47 |
-
gl: "us",
|
| 48 |
-
};
|
| 49 |
-
|
| 50 |
-
const response = await fetch("https://google.serper.dev/search", {
|
| 51 |
-
method: "POST",
|
| 52 |
-
body: JSON.stringify(params),
|
| 53 |
-
headers: {
|
| 54 |
-
"x-api-key": env.SERPER_API_KEY,
|
| 55 |
-
"Content-type": "application/json; charset=UTF-8",
|
| 56 |
-
},
|
| 57 |
-
});
|
| 58 |
-
|
| 59 |
-
/* eslint-disable @typescript-eslint/no-explicit-any */
|
| 60 |
-
const data = (await response.json()) as Record<string, any>;
|
| 61 |
-
|
| 62 |
-
if (!response.ok) {
|
| 63 |
-
throw new Error(
|
| 64 |
-
data["message"] ??
|
| 65 |
-
`Serper API returned error code ${response.status} - ${response.statusText}`
|
| 66 |
-
);
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
return {
|
| 70 |
-
organic_results: data["organic"] ?? [],
|
| 71 |
-
};
|
| 72 |
-
}
|
| 73 |
-
|
| 74 |
-
export async function searchWebSerpApi(query: string) {
|
| 75 |
-
const params = {
|
| 76 |
-
q: query,
|
| 77 |
-
hl: "en",
|
| 78 |
-
gl: "us",
|
| 79 |
-
google_domain: "google.com",
|
| 80 |
-
api_key: env.SERPAPI_KEY,
|
| 81 |
-
} satisfies GoogleParameters;
|
| 82 |
-
|
| 83 |
-
// Show result as JSON
|
| 84 |
-
const response = await getJson("google", params);
|
| 85 |
-
|
| 86 |
-
return response;
|
| 87 |
-
}
|
| 88 |
-
|
| 89 |
-
export async function searchWebYouApi(query: string) {
|
| 90 |
-
const response = await fetch(`https://api.ydc-index.io/search?query=${query}`, {
|
| 91 |
-
method: "GET",
|
| 92 |
-
headers: {
|
| 93 |
-
"X-API-Key": env.YDC_API_KEY,
|
| 94 |
-
"Content-type": "application/json; charset=UTF-8",
|
| 95 |
-
},
|
| 96 |
-
});
|
| 97 |
-
|
| 98 |
-
if (!response.ok) {
|
| 99 |
-
throw new Error(`You.com API returned error code ${response.status} - ${response.statusText}`);
|
| 100 |
-
}
|
| 101 |
-
|
| 102 |
-
const data = (await response.json()) as YouWebSearch;
|
| 103 |
-
const formattedResultsWithSnippets = data.hits
|
| 104 |
-
.map(({ title, url, snippets }) => ({
|
| 105 |
-
title,
|
| 106 |
-
link: url,
|
| 107 |
-
text: snippets?.join("\n") || "",
|
| 108 |
-
hostname: new URL(url).hostname,
|
| 109 |
-
}))
|
| 110 |
-
.sort((a, b) => b.text.length - a.text.length); // desc order by text length
|
| 111 |
-
|
| 112 |
-
return {
|
| 113 |
-
organic_results: formattedResultsWithSnippets,
|
| 114 |
-
};
|
| 115 |
-
}
|
| 116 |
-
|
| 117 |
-
export async function searchSerpStack(query: string) {
|
| 118 |
-
const response = await fetch(
|
| 119 |
-
`http://api.serpstack.com/search?access_key=${env.SERPSTACK_API_KEY}&query=${query}&hl=en&gl=us`,
|
| 120 |
-
{
|
| 121 |
-
method: "GET",
|
| 122 |
-
headers: {
|
| 123 |
-
"Content-type": "application/json; charset=UTF-8",
|
| 124 |
-
},
|
| 125 |
-
}
|
| 126 |
-
);
|
| 127 |
-
|
| 128 |
-
const data = (await response.json()) as Record<string, any>;
|
| 129 |
-
|
| 130 |
-
if (!response.ok) {
|
| 131 |
-
throw new Error(
|
| 132 |
-
data["error"] ??
|
| 133 |
-
`SerpStack API returned error code ${response.status} - ${response.statusText}`
|
| 134 |
-
);
|
| 135 |
-
}
|
| 136 |
-
|
| 137 |
-
const resultsWithSnippets = data["organic_results"].map(
|
| 138 |
-
({ title, url, snippet }: { title: string; url: string; snippet: string | undefined }) => ({
|
| 139 |
-
title,
|
| 140 |
-
link: url,
|
| 141 |
-
text: snippet || "",
|
| 142 |
-
})
|
| 143 |
-
);
|
| 144 |
-
|
| 145 |
-
return {
|
| 146 |
-
organic_results: resultsWithSnippets ?? [],
|
| 147 |
-
};
|
| 148 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import type { ObjectId } from "mongodb";
|
| 2 |
import type { Conversation } from "./Conversation";
|
| 3 |
import type { Timestamps } from "./Timestamps";
|
|
|
|
| 4 |
|
| 5 |
export interface WebSearch extends Timestamps {
|
| 6 |
_id?: ObjectId;
|
|
@@ -14,14 +15,24 @@ export interface WebSearch extends Timestamps {
|
|
| 14 |
}
|
| 15 |
|
| 16 |
export interface WebSearchSource {
|
| 17 |
-
title
|
| 18 |
link: string;
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
| 22 |
|
| 23 |
-
export interface WebSearchUsedSource extends
|
| 24 |
-
context:
|
| 25 |
}
|
| 26 |
|
| 27 |
export type WebSearchMessageSources = {
|
|
@@ -29,18 +40,6 @@ export type WebSearchMessageSources = {
|
|
| 29 |
sources: WebSearchSource[];
|
| 30 |
};
|
| 31 |
|
| 32 |
-
export interface YouWebSearch {
|
| 33 |
-
hits: YouSearchHit[];
|
| 34 |
-
latency: number;
|
| 35 |
-
}
|
| 36 |
-
|
| 37 |
-
interface YouSearchHit {
|
| 38 |
-
url: string;
|
| 39 |
-
title: string;
|
| 40 |
-
description: string;
|
| 41 |
-
snippets: string[];
|
| 42 |
-
}
|
| 43 |
-
|
| 44 |
// eslint-disable-next-line no-shadow
|
| 45 |
export enum WebSearchProvider {
|
| 46 |
GOOGLE = "Google",
|
|
|
|
| 1 |
import type { ObjectId } from "mongodb";
|
| 2 |
import type { Conversation } from "./Conversation";
|
| 3 |
import type { Timestamps } from "./Timestamps";
|
| 4 |
+
import type { HeaderElement } from "$lib/server/websearch/markdown/types";
|
| 5 |
|
| 6 |
export interface WebSearch extends Timestamps {
|
| 7 |
_id?: ObjectId;
|
|
|
|
| 15 |
}
|
| 16 |
|
| 17 |
export interface WebSearchSource {
|
| 18 |
+
title?: string;
|
| 19 |
link: string;
|
| 20 |
+
}
|
| 21 |
+
export interface WebSearchScrapedSource extends WebSearchSource {
|
| 22 |
+
page: WebSearchPage;
|
| 23 |
+
}
|
| 24 |
+
export interface WebSearchPage {
|
| 25 |
+
title: string;
|
| 26 |
+
siteName?: string;
|
| 27 |
+
author?: string;
|
| 28 |
+
description?: string;
|
| 29 |
+
createdAt?: string;
|
| 30 |
+
modifiedAt?: string;
|
| 31 |
+
markdownTree: HeaderElement;
|
| 32 |
}
|
| 33 |
|
| 34 |
+
export interface WebSearchUsedSource extends WebSearchScrapedSource {
|
| 35 |
+
context: string;
|
| 36 |
}
|
| 37 |
|
| 38 |
export type WebSearchMessageSources = {
|
|
|
|
| 40 |
sources: WebSearchSource[];
|
| 41 |
};
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
// eslint-disable-next-line no-shadow
|
| 44 |
export enum WebSearchProvider {
|
| 45 |
GOOGLE = "Google",
|
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
export function isURL(url: string) {
|
| 2 |
+
try {
|
| 3 |
+
new URL(url);
|
| 4 |
+
return true;
|
| 5 |
+
} catch (e) {
|
| 6 |
+
return false;
|
| 7 |
+
}
|
| 8 |
+
}
|
|
@@ -1,6 +1,9 @@
|
|
| 1 |
export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
|
| 2 |
let timer: NodeJS.Timeout;
|
| 3 |
-
return Promise.race([
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
| 6 |
};
|
|
|
|
| 1 |
export const timeout = <T>(prom: Promise<T>, time: number): Promise<T> => {
|
| 2 |
let timer: NodeJS.Timeout;
|
| 3 |
+
return Promise.race([
|
| 4 |
+
prom,
|
| 5 |
+
new Promise<T>((_, reject) => {
|
| 6 |
+
timer = setTimeout(() => reject(new Error(`Timeout after ${time / 1000} seconds`)), time);
|
| 7 |
+
}),
|
| 8 |
+
]).finally(() => clearTimeout(timer));
|
| 9 |
};
|