apify · vdusek · Jun 25, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/.editorconfig b/.editorconfig
@@ -7,6 +7,7 @@ charset = utf-8
 trim_trailing_whitespace = true
 insert_final_newline = true
 end_of_line = lf
+quote_type = single
 
 [Makefile]
 indent_style = tab

diff --git a/.github/workflows/_checks.yaml b/.github/workflows/_checks.yaml
@@ -15,6 +15,9 @@ on:
 permissions:
   contents: read
 
+env:
+  NODE_VERSION: 24  # Node.js version passed to actions/setup-node in the website lint jobs below.
+
 jobs:
   actions_lint_check:
     name: Actions lint check
@@ -46,6 +49,86 @@ jobs:
     with:
       python_versions: '["3.10", "3.11", "3.12", "3.13", "3.14"]'
 
+  markdown_lint_check:  # Lints Markdown by running `pnpm lint:md` inside `website/`.
+    name: Markdown lint check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v7
+
+      - name: Set up Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: ${{ env.NODE_VERSION }}  # Workflow-level `env.NODE_VERSION`.
+
+      - name: Install pnpm and website dependencies
+        uses: apify/actions/pnpm-install@v1.2.0
+        with:
+          working-directory: website
+
+      - name: Lint Markdown
+        run: pnpm lint:md  # NOTE(review): `lint:md` is expected in website/package.json (not visible here).
+        working-directory: website
+
+  website_lint_check:  # Runs `pnpm lint:code` and then `pnpm format:check` inside `website/`.
+    name: Website lint check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v7
+
+      - name: Set up Node
+        uses: actions/setup-node@v6
+        with:
+          node-version: ${{ env.NODE_VERSION }}  # Workflow-level `env.NODE_VERSION`.
+
+      - name: Install pnpm and website dependencies
+        uses: apify/actions/pnpm-install@v1.2.0
+        with:
+          working-directory: website
+
+      - name: Lint website code
+        run: pnpm lint:code  # NOTE(review): script expected in website/package.json (not visible here).
+        working-directory: website
+
+      - name: Check website formatting
+        run: pnpm format:check  # Check-only by its name; presumably does not rewrite files — confirm.
+        working-directory: website
+
+  image_lint_check:  # Fails when the changed-files step reports a matching non-WebP raster image.
+    name: Image lint check
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v7
+        with:
+          fetch-depth: 0  # Presumably full history so changed-files can diff against the base — confirm.
+
+      # Doc images must be committed as optimized `.webp`. This fails when a PR adds raster
+      # images in another format so they get converted via `pnpm opt:images` first.
+      - name: Get changed unoptimized images
+        id: changed-files
+        uses: tj-actions/changed-files@v47  # NOTE(review): third-party action on a mutable tag; consider pinning to a full commit SHA.
+        with:
+          files: |
+            docs/**/*.{png,jpg,jpeg,gif,bmp,tif,tiff,avif}
+            website/static/**/*.{png,jpg,jpeg,gif,bmp,tif,tiff,avif}
+          separator: "\n"  # One path per line, matching the `read` loop in the next step.
+
+      - name: Fail on unoptimized images
+        if: steps.changed-files.outputs.any_changed == 'true'  # Skipped when no matching image changed.
+        env:
+          UNOPTIMIZED_IMAGE_FILES: ${{ steps.changed-files.outputs.all_changed_files }}  # Handed over via env; the script reads "$UNOPTIMIZED_IMAGE_FILES".
+        run: |
+          echo "Unoptimized images detected! Convert each one to WebP, e.g.:"
+          echo ""
+          while IFS= read -r file_path; do
+            echo "  (cd website && pnpm opt:images \"../$file_path\")"
+          done <<< "$UNOPTIMIZED_IMAGE_FILES"
+          echo ""
+          echo "Then reference the resulting .webp files in your Markdown."
+          exit 1
+
   unit_tests:
     name: Unit tests
     if: inputs.run_tests

diff --git a/.github/workflows/manual_release_docs.yaml b/.github/workflows/manual_release_docs.yaml
@@ -23,7 +23,7 @@ permissions:
   contents: read
 
 env:
-  NODE_VERSION: 22
+  NODE_VERSION: 24
   PYTHON_VERSION: 3.14
 
 jobs:

diff --git a/.github/workflows/manual_version_docs.yaml b/.github/workflows/manual_version_docs.yaml
@@ -27,7 +27,7 @@ permissions:
   contents: read
 
 env:
-  NODE_VERSION: "22"
+  NODE_VERSION: "24"
   PYTHON_VERSION: "3.14"
 
 jobs:

diff --git a/.github/workflows/on_schedule_tests.yaml b/.github/workflows/on_schedule_tests.yaml
@@ -16,7 +16,7 @@ permissions:
   contents: read
 
 env:
-  NODE_VERSION: 22
+  NODE_VERSION: 24
   PYTHON_VERSION: 3.14
   TESTS_CONCURRENCY: 1
 

diff --git a/.markdownlint.yaml b/.markdownlint.yaml
@@ -1,8 +1,30 @@
+# markdownlint config for the docs and top-level Markdown files.
+# Run via `pnpm lint:md` / `pnpm lint:md:fix` from the `website/` directory.
 default: true
-line-length:
-  line_length: 120
-MD007:
-  indent: 4
-MD004:
+
+# Prose is written one sentence per line, so line length is not enforced.
+line-length: false
+
+ul-style:
   style: dash
+
+# Nested unordered lists use 4-space indentation.
+ul-indent:
+  indent: 4
+
+# Docs are MDX and embed JSX components.
 no-inline-html: false
+
+# MDX pages set their title via front matter, so multiple/duplicate H1s are fine.
+single-title: false
+no-duplicate-heading:
+  siblings_only: true
+
+# Anchor links into other pages can't be validated locally.
+link-fragments: false
+
+no-bare-urls: false
+no-trailing-punctuation:  # Flag headings ending in . , ; : (ASCII or full-width); "!" and "?" stay allowed.
+  punctuation: ".,;:。，；："
+no-multiple-blanks:
+  maximum: 2
diff --git a/.rules.md b/.rules.md
@@ -52,18 +52,18 @@ Note: `uv run poe unit-tests` first runs tests marked `@pytest.mark.run_alone` i
 - **Type checker**: ty (Astral's type checker), target Python 3.10
 - **Async mode**: pytest-asyncio in `auto` mode (no need for `@pytest.mark.asyncio`)
 - **Commits**: [Conventional Commits](https://www.conventionalcommits.org/) format. Choose the type based on *what* changed, not just *why*:
-  - `feat:` / `fix:` / `perf:` / `refactor:` / `style:` — **source code only**; these trigger a release and appear in the changelog
-  - `test:` — test additions or changes (no release triggered)
-  - `docs:` — documentation changes; also triggers a doc release on master
-  - `ci:` — CI/workflow changes
-  - `chore:` — dependency bumps, tooling, and other housekeeping
-  - `build:` — build system changes
+    - `feat:` / `fix:` / `perf:` / `refactor:` / `style:` — **source code only**; these trigger a release and appear in the changelog
+    - `test:` — test additions or changes (no release triggered)
+    - `docs:` — documentation changes; also triggers a doc release on master
+    - `ci:` — CI/workflow changes
+    - `chore:` — dependency bumps, tooling, and other housekeeping
+    - `build:` — build system changes
 
 ## Architecture
 
 ### Crawler Hierarchy
 
-```
+```text
 BasicCrawler[TCrawlingContext, TStatisticsState]
 ├── AbstractHttpCrawler  →  HttpCrawler, BeautifulSoupCrawler, ParselCrawler
 ├── PlaywrightCrawler
@@ -78,7 +78,7 @@ BasicCrawler[TCrawlingContext, TStatisticsState]
 
 Contexts are progressively enhanced through `ContextPipeline` middleware:
 
-```
+```text
 BasicCrawlingContext → HttpCrawlingContext → ParsedHttpCrawlingContext → BeautifulSoupCrawlingContext
 ```
 
@@ -87,6 +87,7 @@ Each middleware is an async generator that wraps the next handler, enabling setu
 ### Storage Layer
 
 Three-tier design:
+
 - **High-level**: `Dataset`, `KeyValueStore`, `RequestQueue` in `src/crawlee/storages/`
 - **Storage clients** (`src/crawlee/storage_clients/`): `FileSystemStorageClient` (default), `MemoryStorageClient`, `SqlStorageClient`, `RedisStorageClient`
 - **Instance caching**: `StorageInstanceManager` is a global singleton that caches storage instances by ID/name
@@ -98,6 +99,7 @@ Three-tier design:
 ### HTTP Clients
 
 Pluggable via `HttpClient` interface in `src/crawlee/http_clients/`:
+
 - `ImpitHttpClient` (default), `HttpxHttpClient`, `CurlImpersonateHttpClient`
 - Each provides `crawl()` (for crawler pipeline) and `send_request()` (for in-handler use)
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -115,6 +115,22 @@ To run the documentation locally, ensure you have `Node.js` 20+ installed, then
 uv run poe run-docs
 ```
 
+### Linting the docs and website
+
+Markdown content (this guide, `README.md`, and the `docs/` folder) is checked with
+[markdownlint](https://github.com/DavidAnson/markdownlint). The Docusaurus website code is linted
+with [oxlint](https://oxc.rs/) and formatted with [oxfmt](https://oxc.rs/). All of them run in CI.
+To run them locally (requires Node.js 22.12 or newer and pnpm), run the following from the `website/` directory:
+
+```sh
+pnpm lint          # lint Markdown and website code
+pnpm lint:fix      # auto-fix both
+pnpm format        # format the website code
+```
+
+Doc images are committed as optimized `.webp`. To convert a new image, run
+`pnpm opt:images <path-to-image>` from the `website/` directory.
+
 ## Commits
 
 We use [Conventional Commits](https://www.conventionalcommits.org/) format for commit messages. This convention is used to automatically determine version bumps during the release process.
@@ -146,25 +162,22 @@ Publishing new versions to [PyPI](https://pypi.org/project/crawlee) is automated
 
 1. **Do not do this unless absolutely necessary.** In all conceivable scenarios, you should use the `release` workflow instead.
 2. **Make sure you know what you're doing.**
+3. Update the version number by modifying the `version` field under `project` in `pyproject.toml`:
 
-3. Update the version number:
-
-- Modify the `version` field under `project` in `pyproject.toml`.
-
-```toml
-[project]
-name = "crawlee"
-version = "x.z.y"
-```
+    ```toml
+    [project]
+    name = "crawlee"
+    version = "x.y.z"
+    ```
 
 4. Build the package:
 
-```sh
-uv run poe build
-```
+    ```sh
+    uv run poe build
+    ```
 
 5. Upload to PyPI:
 
-```sh
-uv publish --token YOUR_API_TOKEN
-```
+    ```sh
+    uv publish --token YOUR_API_TOKEN
+    ```
diff --git a/docs/deployment/apify_platform.mdx b/docs/deployment/apify_platform.mdx
@@ -13,7 +13,7 @@ import CrawlerAsActorExample from '!!raw-loader!./code_examples/apify/crawler_as
 import ProxyExample from '!!raw-loader!./code_examples/apify/proxy_example.py';
 import ProxyAdvancedExample from '!!raw-loader!./code_examples/apify/proxy_advanced_example.py';
 
-Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api).
+Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks, and [more in the Apify documentation](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api).
 
 While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure.
 
@@ -25,7 +25,7 @@ We do not test Crawlee in other cloud environments such as Lambda or on specific
 
 ## Requirements
 
-To run your Crawlee code on Apify platform, you need an Apify account. If you don't have one yet, you can sign up [here](https://console.apify.com/sign-up).
+To run your Crawlee code on the Apify platform, you need an Apify account. If you don't have one yet, you can [sign up](https://console.apify.com/sign-up).
 
 Additionally, you must have the [Apify CLI](https://docs.apify.com/cli/) installed on your computer. For installation instructions, refer to the [Installation guide](https://docs.apify.com/cli/docs/installation).
 
@@ -72,7 +72,7 @@ When you deploy your script to the Apify platform, it becomes an [Actor](https:/
 
 Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours.
 
-**Related links**
+### Related links
 
 - [Store of existing Actors](https://apify.com/store)
 - [Documentation](https://docs.apify.com/actors)
@@ -135,7 +135,8 @@ There are several things worth mentioning here.
 
 ### Helper functions for default Key-Value Store and Dataset
 
-To simplify access to the _default_ storages, instead of using the helper functions of respective storage classes, you could use:
+To simplify access to the *default* storages, instead of using the helper functions of the respective storage classes, you could use:
+
 - [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore)
 - [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)
 
@@ -150,6 +151,7 @@ If you don't plan to force usage of the platform storages when running the Actor
 :::
 
 {/*
+
 ### Getting public url of an item in the platform storage
 
 If you need to share a link to some file stored in a [Key-Value](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) Store on Apify platform, you can use [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.
@@ -164,7 +166,7 @@ If you need to share a link to some file stored in a [Key-Value](https://docs.ap
 
 When the <ApiLink to="class/Dataset">`Dataset`</ApiLink> is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results.
 
-**Related links**
+### Related links
 
 - [Apify platform storage documentation](https://docs.apify.com/storage)
 - [View storage in Apify Console](https://console.apify.com/storage)
@@ -245,9 +247,10 @@ in the [proxy dashboard](https://console.apify.com/proxy).
 The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy.
 
 The difference is easy to remember.
+
 - If you're using your own proxies - you should create a <ApiLink to="class/ProxyConfiguration">`ProxyConfiguration`</ApiLink> instance directly.
 - If you are planning to use Apify Proxy - you should create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy.
 
-**Related links**
+### Related links
 
 - [Apify Proxy docs](https://docs.apify.com/proxy)
diff --git a/docs/deployment/aws_lambda.mdx b/docs/deployment/aws_lambda.mdx
@@ -160,6 +160,7 @@ Create a repository `lambda/aws-playwright` in [Amazon Elastic Container Registr
 Navigate to the created repository and click the "View push commands" button. This will open a window with console commands for uploading the Docker image to your repository. Execute them.
 
 Example:
+
 ```bash
 aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {user-specific-data}
 docker build --platform linux/amd64 --provenance=false -t lambda/aws-playwright .

diff --git a/docs/guides/architecture_overview.mdx b/docs/guides/architecture_overview.mdx
@@ -164,6 +164,7 @@ StagehandPostNavCrawlingContext --|> StagehandCrawlingContext
 ```
 
 They have a similar inheritance structure as the crawlers, with the base class being <ApiLink to="class/BasicCrawlingContext">`BasicCrawlingContext`</ApiLink>. The specific crawling contexts are:
+
 - <ApiLink to="class/HttpCrawlingContext">`HttpCrawlingContext`</ApiLink> for HTTP crawlers.
 - <ApiLink to="class/ParsedHttpCrawlingContext">`ParsedHttpCrawlingContext`</ApiLink> for HTTP crawlers with parsed responses.
 - <ApiLink to="class/ParselCrawlingContext">`ParselCrawlingContext`</ApiLink> for HTTP crawlers that use [Parsel](https://github.com/scrapy/parsel) for parsing.
@@ -286,6 +287,7 @@ The request <ApiLink to="class/Router">`Router`</ApiLink> is a central component
 Request handlers are user-defined functions that process requests and responses in Crawlee. They are the core of the crawling logic and are responsible for handling data extraction, processing, and storage. Each request handler receives a crawling context as an argument, which provides access to request data, response data, and other information related to the request. Request handlers can be registered with the <ApiLink to="class/Router">`Router`</ApiLink>.
 
 The request routing in Crawlee supports:
+
 - Default handlers - Fallback handlers for requests without specific labels.
 - Label-based routing - Handlers for specific request types based on labels.
 - Error handlers - Handle errors during request processing.

diff --git a/docs/guides/avoid_blocking.mdx b/docs/guides/avoid_blocking.mdx
@@ -50,7 +50,7 @@ For sites with aggressive anti-bot protection, [CloakBrowser](https://github.com
     {PlaywrightWithCloakBrowser}
 </RunnableCodeBlock>
 
-**Related links**
+### Related links
 
 - [Fingerprint Suite Docs](https://github.com/apify/fingerprint-suite)
 - [Apify Academy anti-scraping course](https://docs.apify.com/academy/anti-scraping)
diff --git a/docs/guides/crawler_login.mdx b/docs/guides/crawler_login.mdx
@@ -17,6 +17,7 @@ Many websites require authentication to access their content. This guide demonst
 When implementing authentication, you'll typically want to maintain the same <ApiLink to="class/Session">`Session`</ApiLink> throughout your crawl to preserve login state. This requires proper configuration of the <ApiLink to="class/SessionPool">`SessionPool`</ApiLink>. For more details, see our [session management guide](./session-management).
 
 If your use case requires multiple authenticated sessions with different credentials, you can:
+
 - Use the `new_session_function` parameter in <ApiLink to="class/SessionPool#__init__">`SessionPool`</ApiLink> to customize session creation.
 - Specify the `session_id` parameter in <ApiLink to="class/Request#from_url">`Request`</ApiLink> to bind specific requests to particular sessions.