2.0.0

2026-06-19 07:35:19 +00:00 · 2026-05-02 01:46:00 +05:30
parent 2ee35b0c17
commit 6c5fa8a003
10 changed files with 934 additions and 1045 deletions
@@ -10,6 +10,7 @@ from pathlib import Path
 URL = "https://addons.mozilla.org/firefox/downloads/latest/wappalyzer/platform:2/wappalyzer.xpi"
 REPO_ROOT = Path(__file__).resolve().parent.parent
 DATA_DIR = REPO_ROOT / "wappalyzer" / "data"
+EXTENSION_ARCHIVE = DATA_DIR / "wappalyzer-extension.zip"

 PROMPT_BLOCK = re.compile(
    r"^[ \t]*const current = await get(?:Cached)?Option\('version'\)\n"
@@ -31,6 +32,93 @@ def patch_index_js(content):
    return content


+def patch_manifest_for_chromium(manifest):
+    """Return a patched copy of ``manifest``; the argument is not modified.
+
+    The copy is produced by a JSON round-trip, so it shares no nested
+    containers with the input. The top-level ``browser_specific_settings``
+    and ``applications`` keys are dropped, and ``background.scripts`` is
+    dropped when ``background`` is a dict with a truthy ``service_worker``.
+    """
+    patched = json.loads(json.dumps(manifest))
+
+    for unwanted_key in ("browser_specific_settings", "applications"):
+        patched.pop(unwanted_key, None)
+
+    worker_config = patched.get("background")
+    uses_service_worker = (
+        isinstance(worker_config, dict)
+        and worker_config.get("service_worker")
+    )
+
+    if uses_service_worker:
+        worker_config.pop("scripts", None)
+
+    return patched
+
+
+def validate_manifest(manifest):
+    """Raise if ``manifest`` fails the Chromium MV3 checks below.
+
+    Every check is evaluated and all failures are reported together. A
+    section that is missing, null, or of the wrong type is treated as
+    empty, so it surfaces as a validation error instead of an
+    ``AttributeError``/``TypeError``.
+
+    Args:
+        manifest: Parsed ``manifest.json`` content (a dict).
+
+    Raises:
+        RuntimeError: If any check fails; the message lists every failure
+            separated by "; ".
+    """
+
+    def section(name):
+        value = manifest.get(name)
+        return value if isinstance(value, dict) else {}
+
+    def declared(name):
+        value = manifest.get(name)
+
+        if not isinstance(value, (list, tuple)):
+            return set()
+
+        # Non-string entries can never match a required name, and
+        # unhashable ones would make set construction raise.
+        return {item for item in value if isinstance(item, str)}
+
+    errors = []
+    action = section("action")
+    background = section("background")
+
+    if manifest.get("manifest_version") != 3:
+        errors.append("manifest_version must be 3")
+
+    if action.get("default_popup") != "html/popup.html":
+        errors.append("action.default_popup must be html/popup.html")
+
+    if not background.get("service_worker"):
+        errors.append("background.service_worker is required")
+
+    if "scripts" in background:
+        errors.append("background.scripts must be removed for Chromium MV3")
+
+    permissions = declared("permissions")
+    host_permissions = declared("host_permissions")
+
+    for permission in ("cookies", "storage", "tabs", "webRequest"):
+        if permission not in permissions:
+            errors.append(f"missing permission: {permission}")
+
+    for host_permission in ("http://*/*", "https://*/*"):
+        if host_permission not in host_permissions:
+            errors.append(f"missing host permission: {host_permission}")
+
+    if "browser_specific_settings" in manifest:
+        errors.append("browser_specific_settings must be removed")
+
+    if "applications" in manifest:
+        errors.append("applications must be removed")
+
+    if errors:
+        raise RuntimeError("Invalid Chromium extension manifest: " + "; ".join(errors))
+
+
+def validate_extension_tree(extension_dir, require_technologies=False):
+    """Check that ``extension_dir`` holds the files the extension needs.
+
+    Raises ``RuntimeError`` naming the first required file that is missing,
+    or, when ``require_technologies`` is true, if ``technologies/`` has no
+    ``*.json`` file. Finally the manifest is parsed and passed to
+    ``validate_manifest``, which raises on an invalid manifest.
+    """
+    expected_files = (
+        "manifest.json",
+        "html/popup.html",
+        "js/background.js",
+        "js/index.js",
+        "js/content.js",
+    )
+    first_missing = next(
+        (name for name in expected_files if not (extension_dir / name).is_file()),
+        None,
+    )
+
+    if first_missing is not None:
+        raise RuntimeError(f"Missing extension file: {first_missing}")
+
+    if require_technologies:
+        fingerprint_files = (extension_dir / "technologies").glob("*.json")
+
+        if next(fingerprint_files, None) is None:
+            raise RuntimeError("Missing extension technology fingerprint files")
+
+    manifest_text = (extension_dir / "manifest.json").read_text(encoding="utf-8")
+    validate_manifest(json.loads(manifest_text))
+
+
+def write_chromium_extension_archive(extension_dir, archive_path):
+    """Patch the manifest in ``extension_dir`` and zip the tree.
+
+    ``manifest.json`` is rewritten in place with the output of
+    ``patch_manifest_for_chromium`` (compact JSON), the tree is validated
+    with technology fingerprints required, and every regular file is then
+    written to ``archive_path`` under its path relative to
+    ``extension_dir``, in sorted order.
+    """
+    manifest_file = extension_dir / "manifest.json"
+    upstream_manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
+    compact_manifest = json.dumps(
+        patch_manifest_for_chromium(upstream_manifest),
+        separators=(",", ":"),
+    )
+    manifest_file.write_text(compact_manifest, encoding="utf-8")
+
+    # Validate only after patching, so the checks see what gets archived.
+    validate_extension_tree(extension_dir, require_technologies=True)
+
+    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as bundle:
+        for entry in sorted(extension_dir.rglob("*")):
+            if not entry.is_file():
+                continue
+
+            bundle.write(entry, entry.relative_to(extension_dir))
+
+
 DATA_DIR.mkdir(parents=True, exist_ok=True)

 with tempfile.TemporaryDirectory(prefix="wappalyzer-update-") as tempdir:
@@ -60,7 +148,4 @@ with tempfile.TemporaryDirectory(prefix="wappalyzer-update-") as tempdir:
    shutil.copy2(extract_dir / "groups.json", DATA_DIR / "groups.json")
    shutil.copy2(extract_dir / "categories.json", DATA_DIR / "categories.json")

-    with zipfile.ZipFile(DATA_DIR / "wappalyzer.xpi", "w", zipfile.ZIP_DEFLATED) as archive:
-        for path in sorted(extract_dir.rglob("*")):
-            if path.is_file():
-                archive.write(path, path.relative_to(extract_dir))
+    write_chromium_extension_archive(extract_dir, EXTENSION_ARCHIVE)
@@ -1,15 +1,11 @@
 FROM python:3.12-slim

-RUN apt-get update && apt-get install -y \
-    firefox-esr \
-    wget \
-    && rm -rf /var/lib/apt/lists/*
+ENV PYTHONUNBUFFERED=1

-RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.36.0/geckodriver-v0.36.0-linux64.tar.gz \
-    && tar -xvzf geckodriver-v0.36.0-linux64.tar.gz \
-    && mv geckodriver /usr/local/bin/ \
-    && rm geckodriver-v0.36.0-linux64.tar.gz
+WORKDIR /app
+COPY . /app

-RUN pip install --no-cache-dir wappalyzer
+RUN pip install --no-cache-dir . \
+    && python -m playwright install --with-deps chromium

 ENTRYPOINT ["wappalyzer"]
@@ -1,6 +1,6 @@
 # Wappalyzer Next

-This project is a command line tool and python library that uses [Wappalyzer](https://www.wappalyzer.com/) extension (and its fingerprints) to detect technologies. Other projects that emerged after the discontinuation of the official open-source project are using outdated fingerprints and lack accuracy when used on dynamic web apps. This project bypasses those limitations.
+This project is a command line tool and python library that uses the [Wappalyzer](https://www.wappalyzer.com/) browser extension and its fingerprints to detect technologies. Other projects that emerged after the discontinuation of the official open-source project are using outdated fingerprints and lack accuracy on dynamic web apps. This project bypasses those limitations by running the extension in Chromium through Playwright.

 ![demo](https://github.com/user-attachments/assets/7a51b034-c9a7-44e6-aa80-2f8a23311e72)

@@ -11,46 +11,33 @@ This project is a command line tool and python library that uses [Wappalyzer](ht

 ## Installation

-Before installing wappalyzer, you will need to install [Firefox](https://www.mozilla.org/en-US/firefox/windows/) and [geckodriver](https://github.com/mozilla/geckodriver/releases). Below are detailed steps for setting up geckodriver but you may use google/youtube for help.
-<details>
-<summary>Setting up geckodriver</summary>
+After installing the Python package, install Playwright's Chromium browser:

-### Step 1: Download GeckoDriver
-1. Visit the official GeckoDriver releases page on GitHub:  
-   [https://github.com/mozilla/geckodriver/releases](https://github.com/mozilla/geckodriver/releases)
-2. Download the version compatible with your system:
-   - For Windows: `geckodriver-vX.XX.X-win64.zip`
-   - For macOS: `geckodriver-vX.XX.X-macos.tar.gz`
-   - For Linux: `geckodriver-vX.XX.X-linux64.tar.gz`
-3. Extract the downloaded file to a folder of your choice.
+```bash
+python -m playwright install chromium
+```

-### Step 2: Add GeckoDriver to the System Path
-To ensure Selenium can locate the GeckoDriver executable:
- **Windows**:
-  1. Move the `geckodriver.exe` to a directory (e.g., `C:\WebDrivers\`).
-  2. Add this directory to the system's PATH:
-     - Open **Environment Variables**.
-     - Under **System Variables**, find and select the `Path` variable, then click **Edit**.
-     - Click **New** and enter the directory path where `geckodriver.exe` is stored.
-     - Click **OK** to save.
- **macOS/Linux**:
-  1. Move the `geckodriver` file to `/usr/local/bin/` or another directory in your PATH.
-  2. Use the following command in the terminal:
-     ```bash
-     sudo mv geckodriver /usr/local/bin/
-     ```
-     Ensure `/usr/local/bin/` is in your PATH.
-</details>
+In minimal Linux containers, install Chromium's system dependencies as well:
+
+```bash
+python -m playwright install-deps chromium
+```


 #### Install as a command-line tool
 ```bash
 pipx install wappalyzer
+pipx run --spec playwright playwright install chromium
 ```

 #### Install as a library
 To use it as a library, install it with `pip` inside an isolated container e.g. `venv` or `docker`. You may also pass `--break-system-packages` to do a 'regular' install, but it is not recommended.

+```bash
+pip install wappalyzer
+python -m playwright install chromium
+```
+
 #### Install with docker
 <details><summary>Steps</summary>

@@ -62,7 +49,7 @@ cd wappalyzer-next

 2. Build the image with Docker Compose:
 ```bash
-docker compose up -d
+docker compose build
 ```

 3. To scan URLs using the Docker container:
@@ -71,9 +58,9 @@ docker compose up -d
 ```bash
 docker compose run --rm wappalyzer -i https://example.com
 ```
- Scan Multiple URLs from a file:
+- Scan multiple URLs from a file:
 ```bash
-docker compose run --rm wappalyzer -i https://example.com -oJ output.json
+docker compose run --rm wappalyzer -i urls.txt -w 3 -oJ output.json
 ```
 </details>

@@ -81,12 +68,14 @@ docker compose run --rm wappalyzer -i https://example.com -oJ output.json
 Some common usage examples are given below, refer to list of all options for more information.

 - Scan a single URL: `wappalyzer -i https://example.com`
- Scan multiple URLs from a file: `wappalyzer -i urls.txt -w 10`
+- Scan multiple URLs from a file: `wappalyzer -i urls.txt -w 3`
 - Set page-load timeout for full scans: `wappalyzer -i urls.txt -t 15`
 - Scan with authentication: `wappalyzer -i https://example.com -c "sessionid=abc123; token=xyz789"`
 - Export results to JSON: `wappalyzer -i https://example.com -oJ results.json`
 - Export JSON to stdout: `wappalyzer -i https://example.com -oJ`

+When an output flag is used without a file, the report is written to stdout. Status lines, banner text, and errors are written to stderr.
+
 #### Options

 > Note: For accuracy use 'full' scan type (default). 'fast' and 'balanced' do not use browser emulation.
@@ -128,7 +117,7 @@ for url, technologies in results.items():
        print(f"  {name}{version}")
 ```

-The same scanner can also scan one URL at a time without reopening Firefox:
+The same scanner can also scan one URL at a time without reopening Chromium:

 ```python
 from wappalyzer import Wappalyzer
@@ -151,7 +140,7 @@ results = analyze(
 )
 ```

-Do not call the top-level `analyze()` function in a loop for large jobs. Use `Wappalyzer.analyze_many()` or `Wappalyzer.analyze()` on a reused scanner so Firefox and the Wappalyzer extension are not reloaded for every URL.
+Do not call the top-level `analyze()` function in a loop for large jobs. Use `Wappalyzer.analyze_many()` or `Wappalyzer.analyze()` on a reused scanner so Chromium and the Wappalyzer extension are not reloaded for every URL.

 #### analyze() Function Parameters

@@ -171,18 +160,27 @@ Returns a dictionary with the URL as key and detected technologies as value:
 ```json
 {
  "https://github.com": {
-    "Amazon S3": {"version": "", "confidence": 100, "categories": ["CDN"], "groups": ["Servers"]},
-    "lit-html": {"version": "1.1.2", "confidence": 100, "categories": ["JavaScript libraries"], "groups": ["Web development"]},
-    "React Router": {"version": "6", "confidence": 100, "categories": ["JavaScript frameworks"], "groups": ["Web development"]},
-  "https://google.com" : {},
-  "https://example.com" : {},
-}}
+    "Amazon S3": {
+      "version": "",
+      "confidence": 100,
+      "categories": ["CDN"],
+      "groups": ["Servers"]
+    },
+    "React Router": {
+      "version": "6",
+      "confidence": 100,
+      "categories": ["JavaScript frameworks"],
+      "groups": ["Web development"]
+    }
+  },
+  "https://example.com": {}
+}
 ```

 ### FAQ

-#### Why use Firefox instead of Chrome?
-Firefox extensions are .xpi files which are essentially zip files. This makes it easier to extract data and slightly modify the extension to make this tool work.
+#### Why Chromium and Playwright?
+The full scanner runs the Wappalyzer extension in Chromium through Playwright. Playwright can load the extension into Chromium directly, so the separate geckodriver and Selenium setup that the Firefox-based scanner needed is no longer required.

 #### What is the difference between 'fast', 'balanced', and 'full' scan types?
 - **fast**: Sends a single HTTP request to the URL. Doesn't use the extension.
@@ -1,9 +1,7 @@
 services:
  wappalyzer:
    build: .
+    shm_size: '2gb'
    volumes:
      - .:/app
    working_dir: /app
-  firefox:
-    image: selenium/standalone-firefox:latest
-    shm_size: '2gb'
@@ -10,7 +10,7 @@ with io.open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:

 setup(
    name='wappalyzer',
-    version='2.0.0rc1',
+    version='2.0.0',
    description='Wappalyzer-based tech stack detection library',
    long_description=desc,
    long_description_content_type='text/markdown',
@@ -25,7 +25,7 @@ setup(
        'requests',
        'urllib3',
        'huepy',
-        'selenium',
+        'playwright',
        'tldextract',
        'beautifulsoup4',
        'dnspython'
@@ -21,4 +21,4 @@ groups_db = {}
 with open(data_dir + '/groups.json', 'r') as f:
    groups_db = json.load(f)

-extension_path = data_dir + '/wappalyzer.xpi'
+extension_path = data_dir + '/wappalyzer-extension.zip'
@@ -7,7 +7,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

 def get_response(url, cookie=None, **kwargs):
    headers = {
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "deflate",
@@ -1,5 +1,6 @@
+import asyncio
+import concurrent.futures
 import threading
-from queue import Empty, Queue

 from wappalyzer.browser.analyzer import (
    DriverPool,
@@ -10,9 +11,135 @@ from wappalyzer.browser.analyzer import (
 from wappalyzer.core.analyzer import http_scan


+class _LoopRunner:
+    """Run coroutines on a private event loop hosted by a daemon thread.
+
+    ``run()`` submits a coroutine from any other thread and blocks until
+    it finishes. ``close()`` stops the loop, waits for the thread, and
+    closes the loop so its resources are released.
+    """
+
+    def __init__(self):
+        self.loop = asyncio.new_event_loop()
+        self.thread = threading.Thread(target=self._run, daemon=True)
+        self.thread.start()
+
+    def _run(self):
+        """Thread target: serve ``self.loop`` until it is stopped."""
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    def run(self, coro):
+        """Run ``coro`` on the loop thread and return its result.
+
+        Exceptions raised by the coroutine propagate to the caller. On
+        ``KeyboardInterrupt`` in the waiting thread the submitted work is
+        cancelled before the interrupt is re-raised.
+
+        Raises:
+            RuntimeError: If the runner has already been closed.
+        """
+        if self.loop.is_closed():
+            # Nothing would ever execute the coroutine; fail fast instead
+            # of blocking forever, and close it to avoid a "never awaited"
+            # warning.
+            coro.close()
+            raise RuntimeError("Event loop runner is closed")
+
+        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+
+        try:
+            return future.result()
+        except KeyboardInterrupt:
+            future.cancel()
+            raise
+
+    def close(self):
+        """Stop the loop thread and release the loop. Safe to call twice."""
+        if self.loop.is_closed():
+            return
+
+        self.loop.call_soon_threadsafe(self.loop.stop)
+        self.thread.join()
+        # run_forever() has returned once the thread is joined, so the
+        # loop can be closed here.
+        self.loop.close()
+
+
+class _FullScanBackend:
+    """Async backend for 'full' scans, built around a shared ``DriverPool``.
+
+    The pool is created lazily on first use and kept until ``close()`` is
+    awaited.
+    """
+
+    # Upper bound on concurrent scan workers, whatever ``workers`` says.
+    MAX_BROWSER_WORKERS = 3
+
+    def __init__(self, workers=1, timeout=30):
+        self.workers = workers
+        self.timeout = timeout
+        self.pool = None
+        self.pool_size = 0
+
+    async def ensure_pool(self, size):
+        """Make sure a started pool of at least ``size`` exists.
+
+        An existing pool is grown when ``size`` exceeds its recorded size
+        and is never shrunk. If starting a new pool fails, it is cleaned
+        up and the error is re-raised without being stored.
+        """
+        # NOTE(review): two coroutines entering while no pool exists could
+        # each create one; confirm callers serialize first use.
+        if self.pool:
+            if size > self.pool_size:
+                await self.pool.grow_to(size)
+                self.pool_size = size
+
+            return
+
+        pool = DriverPool(size=size, timeout=self.timeout)
+
+        try:
+            await pool.start()
+        except Exception:
+            await pool.cleanup()
+            raise
+
+        self.pool = pool
+        self.pool_size = size
+
+    async def analyze_url(self, url, cookie=None):
+        """Scan one URL and return ``(result_url, merged_technologies)``."""
+        await self.ensure_pool(1)
+
+        async with self.pool.get_driver() as driver:
+            if cookie:
+                for cookie_dict in cookie_to_cookies(cookie):
+                    # NOTE(review): called without await; confirm that
+                    # add_cookie is synchronous on this driver type.
+                    driver.add_cookie(cookie_dict)
+
+            result_url, detections = await process_url(driver, url)
+
+        return result_url, merge_technologies(detections)
+
+    async def analyze_many(self, urls, cookie=None, on_result=None, on_error=None):
+        """Scan ``urls`` concurrently and return ``{result_url: technologies}``.
+
+        Falsy entries in ``urls`` are ignored. A URL whose scan raises is
+        recorded with an empty result and reported through ``on_error``;
+        ``on_result`` is called for every URL, failed or not. Both
+        callbacks are invoked from the worker coroutines.
+
+        If a worker itself fails (for example because a callback raises)
+        or this coroutine is cancelled, the remaining workers are
+        cancelled and awaited before the error propagates.
+        """
+        urls = [url for url in urls if url]
+
+        if not urls:
+            return {}
+
+        worker_count = min(self.workers, self.MAX_BROWSER_WORKERS, len(urls))
+        await self.ensure_pool(worker_count)
+
+        queue = asyncio.Queue()
+        results = {}
+
+        for url in urls:
+            queue.put_nowait(url)
+
+        async def worker():
+            while True:
+                try:
+                    url = queue.get_nowait()
+                except asyncio.QueueEmpty:
+                    break
+
+                result_url = url
+                technologies = {}
+                error = None
+
+                try:
+                    result_url, technologies = await self.analyze_url(url, cookie)
+                except Exception as exc:
+                    error = exc
+
+                results[result_url] = technologies
+
+                if error and on_error:
+                    on_error(result_url, error)
+
+                if on_result:
+                    on_result(result_url, technologies)
+
+                queue.task_done()
+
+        workers = [
+            asyncio.create_task(worker())
+            for _ in range(worker_count)
+        ]
+
+        try:
+            await asyncio.gather(*workers)
+        except BaseException:
+            # gather() cancels its children only when it is cancelled
+            # itself. When one worker fails, the others would otherwise
+            # keep scanning in the background after this call has raised.
+            for worker_task in workers:
+                worker_task.cancel()
+
+            # Wait for the cancellations to finish so no worker is still
+            # inside a driver context when the caller sees the error.
+            await asyncio.gather(*workers, return_exceptions=True)
+
+            raise
+
+        return results
+
+    async def close(self):
+        """Clean up the pool, if any, and reset to the initial state."""
+        if self.pool:
+            await self.pool.cleanup()
+            self.pool = None
+            self.pool_size = 0
+
+
 class Wappalyzer:
    SUPPORTED_SCAN_TYPES = {"fast", "balanced", "full"}
-    MAX_BROWSER_WORKERS = 3

    def __init__(self, scan_type="full", workers=1, cookie=None, timeout=30):
        scan_type = scan_type.lower()
@@ -33,9 +160,9 @@ class Wappalyzer:
        self.workers = workers
        self.cookie = cookie
        self.timeout = timeout
-        self._driver_pool = None
-        self._driver_pool_size = 0
        self._closed = False
+        self._runner = None
+        self._full_backend = None
        self._lock = threading.RLock()

    def __enter__(self):
@@ -51,12 +178,18 @@ class Wappalyzer:
                return

            self._closed = True
-            driver_pool = self._driver_pool
-            self._driver_pool = None
-            self._driver_pool_size = 0
+            runner = self._runner
+            backend = self._full_backend
+            self._runner = None
+            self._full_backend = None

-        if driver_pool:
-            driver_pool.cleanup()
+        if runner and backend:
+            try:
+                runner.run(backend.close())
+            finally:
+                runner.close()
+        elif runner:
+            runner.close()

    def analyze(self, url, cookie=None):
        result_url, technologies = self._analyze_url(url, cookie)
@@ -69,143 +202,89 @@ class Wappalyzer:
        if not urls:
            return {}

-        worker_count = self._worker_count(len(urls))
+        self._check_open()

        if self.scan_type == "full":
-            self._ensure_driver_pool(worker_count)
+            return self._full_runner().run(
+                self._full_backend.analyze_many(
+                    urls,
+                    cookie=self._effective_cookie(cookie),
+                    on_result=on_result,
+                    on_error=on_error,
+                )
+            )

-        url_queue = Queue()
-        result_queue = Queue()
-        stop_event = threading.Event()
-        results = {}
-
-        for url in urls:
-            url_queue.put(url)
-
-        def worker():
-            while not stop_event.is_set():
-                try:
-                    original_url = url_queue.get_nowait()
-                except Empty:
-                    break
-
-                result_url = original_url
-                technologies = {}
-                error = None
-
-                try:
-                    result_url, technologies = self._analyze_url(original_url, cookie)
-                except Exception as exc:
-                    error = exc
-                finally:
-                    result_queue.put((result_url, technologies or {}, error))
-                    url_queue.task_done()
-
-        threads = [
-            threading.Thread(target=worker)
-            for _ in range(worker_count)
-        ]
-        interrupted = False
-        processed = 0
-
-        for thread in threads:
-            thread.start()
-
-        try:
-            while processed < len(urls):
-                try:
-                    result_url, technologies, error = result_queue.get(timeout=0.1)
-                except Empty:
-                    if all(not thread.is_alive() for thread in threads):
-                        break
-
-                    continue
-
-                processed += 1
-                results[result_url] = technologies
-
-                if error and on_error:
-                    on_error(result_url, error)
-
-                if on_result:
-                    on_result(result_url, technologies)
-
-                result_queue.task_done()
-        except KeyboardInterrupt:
-            interrupted = True
-            stop_event.set()
-        finally:
-            for thread in threads:
-                thread.join()
-
-            while not result_queue.empty():
-                result_url, technologies, error = result_queue.get()
-                results[result_url] = technologies
-
-                if error and on_error:
-                    on_error(result_url, error)
-
-                if on_result:
-                    on_result(result_url, technologies)
-
-                result_queue.task_done()
-
-        if interrupted:
-            raise KeyboardInterrupt
-
-        return results
+        return self._analyze_many_http(
+            urls,
+            cookie=self._effective_cookie(cookie),
+            on_result=on_result,
+            on_error=on_error,
+        )

    def _check_open(self):
        if self._closed:
            raise RuntimeError("Wappalyzer scanner is closed")

-    def _worker_count(self, url_count):
-        if self.scan_type == "full":
-            return min(self.workers, self.MAX_BROWSER_WORKERS, url_count)
-
-        return min(self.workers, url_count)
-
-    def _browser_worker_count(self):
-        return min(self.workers, self.MAX_BROWSER_WORKERS)
-
-    def _ensure_driver_pool(self, size):
-        with self._lock:
-            self._check_open()
-
-            if self._driver_pool:
-                if size > self._driver_pool_size:
-                    self._driver_pool.grow_to(size)
-                    self._driver_pool_size = size
-
-                return self._driver_pool
-
-            self._driver_pool = DriverPool(size=size, timeout=self.timeout)
-            self._driver_pool_size = size
-
-            return self._driver_pool
-
    def _effective_cookie(self, cookie):
        return self.cookie if cookie is None else cookie

+    def _full_runner(self):
+        """Return the loop runner for full scans, creating it on first use.
+
+        Runs under ``self._lock`` and raises if the scanner is closed. The
+        first call also creates ``self._full_backend`` from this
+        scanner's ``workers`` and ``timeout``.
+        """
+        with self._lock:
+            self._check_open()
+
+            if self._runner:
+                return self._runner
+
+            self._runner = _LoopRunner()
+            self._full_backend = _FullScanBackend(
+                workers=self.workers,
+                timeout=self.timeout,
+            )
+
+            return self._runner
+
    def _analyze_url(self, url, cookie=None):
        self._check_open()
+        cookie = self._effective_cookie(cookie)

        if self.scan_type == "full":
-            return self._analyze_full_url(url, self._effective_cookie(cookie))
+            return self._full_runner().run(
+                self._full_backend.analyze_url(url, cookie=cookie)
+            )

-        return url, http_scan(url, self.scan_type, self._effective_cookie(cookie))
+        return url, http_scan(url, self.scan_type, cookie)

-    def _analyze_full_url(self, url, cookie=None):
-        driver_pool = self._ensure_driver_pool(self._browser_worker_count())
+    def _analyze_many_http(self, urls, cookie=None, on_result=None, on_error=None):
+        worker_count = min(self.workers, len(urls))
+        results = {}

-        with driver_pool.get_driver() as driver:
-            if cookie:
-                for cookie_dict in cookie_to_cookies(cookie):
-                    driver.add_cookie(cookie_dict)
+        def scan(url):
+            return url, http_scan(url, self.scan_type, cookie)

-            result_url, detections = process_url(driver, url)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
+            future_to_url = {
+                executor.submit(scan, url): url
+                for url in urls
+            }

-        return result_url, merge_technologies(detections)
+            for future in concurrent.futures.as_completed(future_to_url):
+                url = future_to_url[future]
+                result_url = url
+                technologies = {}
+                error = None
+
+                try:
+                    result_url, technologies = future.result()
+                except Exception as exc:
+                    error = exc
+
+                results[result_url] = technologies
+
+                if error and on_error:
+                    on_error(result_url, error)
+
+                if on_result:
+                    on_result(result_url, technologies)
+
+        return results


 Scanner = Wappalyzer