This commit is contained in:
Somdev Sangwan
2026-05-02 01:46:00 +05:30
parent 2ee35b0c17
commit 6c5fa8a003
10 changed files with 934 additions and 1045 deletions
+89 -4
View File
@@ -10,6 +10,7 @@ from pathlib import Path
URL = "https://addons.mozilla.org/firefox/downloads/latest/wappalyzer/platform:2/wappalyzer.xpi"
REPO_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = REPO_ROOT / "wappalyzer" / "data"
EXTENSION_ARCHIVE = DATA_DIR / "wappalyzer-extension.zip"
PROMPT_BLOCK = re.compile(
r"^[ \t]*const current = await get(?:Cached)?Option\('version'\)\n"
@@ -31,6 +32,93 @@ def patch_index_js(content):
return content
def patch_manifest_for_chromium(manifest):
    """Return a patched copy of an extension manifest for use in Chromium.

    The argument is not modified: a JSON round-trip produces an independent
    copy. The copy has its ``browser_specific_settings`` and ``applications``
    keys removed and, when ``background`` is a dict with a truthy
    ``service_worker`` entry, its ``background.scripts`` entry removed too.
    """
    patched = json.loads(json.dumps(manifest))

    for dropped_key in ("browser_specific_settings", "applications"):
        patched.pop(dropped_key, None)

    background = patched.get("background")
    uses_service_worker = (
        isinstance(background, dict) and background.get("service_worker")
    )
    if uses_service_worker:
        # Only drop the scripts list when a service worker is declared.
        background.pop("scripts", None)

    return patched
def validate_manifest(manifest):
    """Check that a manifest dict meets the Chromium MV3 requirements below.

    All problems are collected and reported together. Sections with the
    wrong JSON type (for example ``"action": null`` or a string
    ``background``) are treated as absent, so they show up as validation
    errors instead of crashing with ``AttributeError``/``TypeError``.

    Args:
        manifest: Parsed ``manifest.json`` content as a dict.

    Raises:
        RuntimeError: If any requirement is not met; the message lists every
            failed check separated by ``"; "``.
    """

    def section(key):
        # A non-dict section cannot hold the expected keys; treat as empty.
        value = manifest.get(key)
        return value if isinstance(value, dict) else {}

    def listing(key):
        # A non-list value cannot hold the expected entries; treat as empty.
        value = manifest.get(key)
        return value if isinstance(value, (list, tuple)) else ()

    errors = []

    if manifest.get("manifest_version") != 3:
        errors.append("manifest_version must be 3")

    if section("action").get("default_popup") != "html/popup.html":
        errors.append("action.default_popup must be html/popup.html")

    background = section("background")
    if not background.get("service_worker"):
        errors.append("background.service_worker is required")
    if "scripts" in background:
        errors.append("background.scripts must be removed for Chromium MV3")

    permissions = listing("permissions")
    for permission in ("cookies", "storage", "tabs", "webRequest"):
        if permission not in permissions:
            errors.append(f"missing permission: {permission}")

    host_permissions = listing("host_permissions")
    for host_permission in ("http://*/*", "https://*/*"):
        if host_permission not in host_permissions:
            errors.append(f"missing host permission: {host_permission}")

    if "browser_specific_settings" in manifest:
        errors.append("browser_specific_settings must be removed")
    if "applications" in manifest:
        errors.append("applications must be removed")

    if errors:
        raise RuntimeError("Invalid Chromium extension manifest: " + "; ".join(errors))
def validate_extension_tree(extension_dir, require_technologies=False):
    """Verify that an unpacked extension directory has the expected layout.

    Args:
        extension_dir: ``pathlib.Path`` of the unpacked extension.
        require_technologies: When true, also require at least one
            ``technologies/*.json`` file.

    Raises:
        RuntimeError: If a required file is missing, if fingerprint files are
            required but absent, or if ``validate_manifest`` rejects the
            manifest.
    """
    for relative_path in (
        "manifest.json",
        "html/popup.html",
        "js/background.js",
        "js/index.js",
        "js/content.js",
    ):
        candidate = extension_dir / relative_path
        if not candidate.is_file():
            raise RuntimeError(f"Missing extension file: {relative_path}")

    if require_technologies:
        fingerprint_files = (extension_dir / "technologies").glob("*.json")
        if next(fingerprint_files, None) is None:
            raise RuntimeError("Missing extension technology fingerprint files")

    manifest_text = (extension_dir / "manifest.json").read_text(encoding="utf-8")
    validate_manifest(json.loads(manifest_text))
def write_chromium_extension_archive(extension_dir, archive_path):
    """Patch the manifest in place, validate the tree, and zip it up.

    The manifest inside ``extension_dir`` is rewritten (compact JSON) with the
    output of ``patch_manifest_for_chromium``. The directory is then checked
    with ``validate_extension_tree`` (fingerprint files required) before every
    regular file under it is written to ``archive_path`` as a deflated zip,
    using paths relative to ``extension_dir`` in sorted order.

    Args:
        extension_dir: ``pathlib.Path`` of the unpacked extension.
        archive_path: Destination path of the zip archive.
    """
    manifest_path = extension_dir / "manifest.json"
    original_manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    chromium_manifest = patch_manifest_for_chromium(original_manifest)
    manifest_path.write_text(
        json.dumps(chromium_manifest, separators=(",", ":")),
        encoding="utf-8",
    )

    # Validate after patching so the check sees what will actually be shipped.
    validate_extension_tree(extension_dir, require_technologies=True)

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        members = (
            entry for entry in sorted(extension_dir.rglob("*")) if entry.is_file()
        )
        for member in members:
            archive.write(member, member.relative_to(extension_dir))
DATA_DIR.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory(prefix="wappalyzer-update-") as tempdir:
@@ -60,7 +148,4 @@ with tempfile.TemporaryDirectory(prefix="wappalyzer-update-") as tempdir:
shutil.copy2(extract_dir / "groups.json", DATA_DIR / "groups.json")
shutil.copy2(extract_dir / "categories.json", DATA_DIR / "categories.json")
with zipfile.ZipFile(DATA_DIR / "wappalyzer.xpi", "w", zipfile.ZIP_DEFLATED) as archive:
for path in sorted(extract_dir.rglob("*")):
if path.is_file():
archive.write(path, path.relative_to(extract_dir))
write_chromium_extension_archive(extract_dir, EXTENSION_ARCHIVE)
+5 -9
View File
@@ -1,15 +1,11 @@
FROM python:3.12-slim
RUN apt-get update && apt-get install -y \
firefox-esr \
wget \
&& rm -rf /var/lib/apt/lists/*
ENV PYTHONUNBUFFERED=1
RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.36.0/geckodriver-v0.36.0-linux64.tar.gz \
&& tar -xvzf geckodriver-v0.36.0-linux64.tar.gz \
&& mv geckodriver /usr/local/bin/ \
&& rm geckodriver-v0.36.0-linux64.tar.gz
WORKDIR /app
COPY . /app
RUN pip install --no-cache-dir wappalyzer
RUN pip install --no-cache-dir . \
&& python -m playwright install --with-deps chromium
ENTRYPOINT ["wappalyzer"]
+41 -43
View File
@@ -1,6 +1,6 @@
# Wappalyzer Next
This project is a command line tool and python library that uses [Wappalyzer](https://www.wappalyzer.com/) extension (and its fingerprints) to detect technologies. Other projects that emerged after the discontinuation of the official open-source project are using outdated fingerprints and lack accuracy when used on dynamic web apps. This project bypasses those limitations.
This project is a command-line tool and Python library that uses the [Wappalyzer](https://www.wappalyzer.com/) browser extension and its fingerprints to detect technologies. Other projects that emerged after the discontinuation of the official open-source project use outdated fingerprints and lack accuracy on dynamic web apps. This project bypasses those limitations by running the extension in Chromium through Playwright.
![demo](https://github.com/user-attachments/assets/7a51b034-c9a7-44e6-aa80-2f8a23311e72)
@@ -11,46 +11,33 @@ This project is a command line tool and python library that uses [Wappalyzer](ht
## Installation
Before installing wappalyzer, you will need to install [Firefox](https://www.mozilla.org/en-US/firefox/windows/) and [geckodriver](https://github.com/mozilla/geckodriver/releases). Below are detailed steps for setting up geckodriver but you may use google/youtube for help.
<details>
<summary>Setting up geckodriver</summary>
After installing the Python package, install Playwright's Chromium browser:
### Step 1: Download GeckoDriver
1. Visit the official GeckoDriver releases page on GitHub:
[https://github.com/mozilla/geckodriver/releases](https://github.com/mozilla/geckodriver/releases)
2. Download the version compatible with your system:
- For Windows: `geckodriver-vX.XX.X-win64.zip`
- For macOS: `geckodriver-vX.XX.X-macos.tar.gz`
- For Linux: `geckodriver-vX.XX.X-linux64.tar.gz`
3. Extract the downloaded file to a folder of your choice.
```bash
python -m playwright install chromium
```
### Step 2: Add GeckoDriver to the System Path
To ensure Selenium can locate the GeckoDriver executable:
- **Windows**:
1. Move the `geckodriver.exe` to a directory (e.g., `C:\WebDrivers\`).
2. Add this directory to the system's PATH:
- Open **Environment Variables**.
- Under **System Variables**, find and select the `Path` variable, then click **Edit**.
- Click **New** and enter the directory path where `geckodriver.exe` is stored.
- Click **OK** to save.
- **macOS/Linux**:
1. Move the `geckodriver` file to `/usr/local/bin/` or another directory in your PATH.
2. Use the following command in the terminal:
```bash
sudo mv geckodriver /usr/local/bin/
```
Ensure `/usr/local/bin/` is in your PATH.
</details>
In minimal Linux containers, install Chromium's system dependencies as well:
```bash
python -m playwright install-deps chromium
```
#### Install as a command-line tool
```bash
pipx install wappalyzer
pipx run --spec playwright playwright install chromium
```
#### Install as a library
To use it as a library, install it with `pip` inside an isolated environment, e.g. `venv` or `docker`. You may also pass `--break-system-packages` to do a 'regular' install, but this is not recommended.
```bash
pip install wappalyzer
python -m playwright install chromium
```
#### Install with docker
<details><summary>Steps</summary>
@@ -62,7 +49,7 @@ cd wappalyzer-next
2. Build the image with Docker Compose:
```bash
docker compose up -d
docker compose build
```
3. To scan URLs using the Docker container:
@@ -71,9 +58,9 @@ docker compose up -d
```bash
docker compose run --rm wappalyzer -i https://example.com
```
- Scan Multiple URLs from a file:
- Scan multiple URLs from a file:
```bash
docker compose run --rm wappalyzer -i https://example.com -oJ output.json
docker compose run --rm wappalyzer -i urls.txt -w 3 -oJ output.json
```
</details>
@@ -81,12 +68,14 @@ docker compose run --rm wappalyzer -i https://example.com -oJ output.json
Some common usage examples are given below; refer to the list of all options for more information.
- Scan a single URL: `wappalyzer -i https://example.com`
- Scan multiple URLs from a file: `wappalyzer -i urls.txt -w 10`
- Scan multiple URLs from a file: `wappalyzer -i urls.txt -w 3`
- Set page-load timeout for full scans: `wappalyzer -i urls.txt -t 15`
- Scan with authentication: `wappalyzer -i https://example.com -c "sessionid=abc123; token=xyz789"`
- Export results to JSON: `wappalyzer -i https://example.com -oJ results.json`
- Export JSON to stdout: `wappalyzer -i https://example.com -oJ`
When an output flag is used without a file, the report is written to stdout. Status lines, banner text, and errors are written to stderr.
#### Options
> Note: For accuracy, use the 'full' scan type (default). 'fast' and 'balanced' do not use browser emulation.
@@ -128,7 +117,7 @@ for url, technologies in results.items():
print(f" {name}{version}")
```
The same scanner can also scan one URL at a time without reopening Firefox:
The same scanner can also scan one URL at a time without reopening Chromium:
```python
from wappalyzer import Wappalyzer
@@ -151,7 +140,7 @@ results = analyze(
)
```
Do not call the top-level `analyze()` function in a loop for large jobs. Use `Wappalyzer.analyze_many()` or `Wappalyzer.analyze()` on a reused scanner so Firefox and the Wappalyzer extension are not reloaded for every URL.
Do not call the top-level `analyze()` function in a loop for large jobs. Use `Wappalyzer.analyze_many()` or `Wappalyzer.analyze()` on a reused scanner so Chromium and the Wappalyzer extension are not reloaded for every URL.
#### analyze() Function Parameters
@@ -171,18 +160,27 @@ Returns a dictionary with the URL as key and detected technologies as value:
```json
{
"https://github.com": {
"Amazon S3": {"version": "", "confidence": 100, "categories": ["CDN"], "groups": ["Servers"]},
"lit-html": {"version": "1.1.2", "confidence": 100, "categories": ["JavaScript libraries"], "groups": ["Web development"]},
"React Router": {"version": "6", "confidence": 100, "categories": ["JavaScript frameworks"], "groups": ["Web development"]},
"https://google.com" : {},
"https://example.com" : {},
}}
"Amazon S3": {
"version": "",
"confidence": 100,
"categories": ["CDN"],
"groups": ["Servers"]
},
"React Router": {
"version": "6",
"confidence": 100,
"categories": ["JavaScript frameworks"],
"groups": ["Web development"]
}
},
"https://example.com": {}
}
```
### FAQ
#### Why use Firefox instead of Chrome?
Firefox extensions are .xpi files which are essentially zip files. This makes it easier to extract data and slightly modify the extension to make this tool work.
#### Why Chromium and Playwright?
The full scanner runs the Wappalyzer extension in Chromium through Playwright. Chromium extension support in Playwright is direct and does not require geckodriver or Selenium.
#### What is the difference between 'fast', 'balanced', and 'full' scan types?
- **fast**: Sends a single HTTP request to the URL. Doesn't use the extension.
+1 -3
View File
@@ -1,9 +1,7 @@
services:
wappalyzer:
build: .
shm_size: '2gb'
volumes:
- .:/app
working_dir: /app
firefox:
image: selenium/standalone-firefox:latest
shm_size: '2gb'
+2 -2
View File
@@ -10,7 +10,7 @@ with io.open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
setup(
name='wappalyzer',
version='2.0.0rc1',
version='2.0.0',
description='Wappalyzer-based tech stack detection library',
long_description=desc,
long_description_content_type='text/markdown',
@@ -25,7 +25,7 @@ setup(
'requests',
'urllib3',
'huepy',
'selenium',
'playwright',
'tldextract',
'beautifulsoup4',
'dnspython'
File diff suppressed because it is too large Load Diff
+1 -1
View File
@@ -21,4 +21,4 @@ groups_db = {}
with open(data_dir + '/groups.json', 'r') as f:
groups_db = json.load(f)
extension_path = data_dir + '/wappalyzer.xpi'
extension_path = data_dir + '/wappalyzer-extension.zip'
+1 -1
View File
@@ -7,7 +7,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def get_response(url, cookie=None, **kwargs):
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "deflate",
+205 -126
View File
@@ -1,5 +1,6 @@
import asyncio
import concurrent.futures
import threading
from queue import Empty, Queue
from wappalyzer.browser.analyzer import (
DriverPool,
@@ -10,9 +11,135 @@ from wappalyzer.browser.analyzer import (
from wappalyzer.core.analyzer import http_scan
class _LoopRunner:
    """Run coroutines on a private asyncio event loop from synchronous code.

    A new event loop is created and driven by ``run_forever`` on a daemon
    thread started in the constructor. ``run`` submits a coroutine to that
    loop and blocks the calling thread until it finishes.
    """

    def __init__(self):
        self.loop = asyncio.new_event_loop()
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        """Thread target: bind the loop to this thread and run until stopped."""
        asyncio.set_event_loop(self.loop)
        self.loop.run_forever()

    def run(self, coro):
        """Run ``coro`` on the loop thread and return its result.

        Exceptions raised by the coroutine propagate to the caller. A
        ``KeyboardInterrupt`` in the waiting thread cancels the submitted
        coroutine before being re-raised.

        Raises:
            RuntimeError: If the runner has already been closed.
        """
        if self.loop.is_closed():
            # Fail fast instead of blocking forever on a loop that will never
            # run again; close the coroutine so it is not reported as
            # "never awaited".
            coro.close()
            raise RuntimeError("Event loop is closed")

        future = asyncio.run_coroutine_threadsafe(coro, self.loop)
        try:
            return future.result()
        except KeyboardInterrupt:
            future.cancel()
            raise

    def close(self):
        """Stop the loop, wait for its thread, and release loop resources.

        Safe to call more than once; later calls are no-ops.
        """
        if self.loop.is_closed():
            return
        self.loop.call_soon_threadsafe(self.loop.stop)
        self.thread.join()
        # The loop is no longer running once the thread has exited, so it can
        # be closed here to free its selector and self-pipe.
        self.loop.close()
class _FullScanBackend:
MAX_BROWSER_WORKERS = 3
def __init__(self, workers=1, timeout=30):
self.workers = workers
self.timeout = timeout
self.pool = None
self.pool_size = 0
async def ensure_pool(self, size):
if self.pool:
if size > self.pool_size:
await self.pool.grow_to(size)
self.pool_size = size
return
pool = DriverPool(size=size, timeout=self.timeout)
try:
await pool.start()
except Exception:
await pool.cleanup()
raise
self.pool = pool
self.pool_size = size
async def analyze_url(self, url, cookie=None):
await self.ensure_pool(1)
async with self.pool.get_driver() as driver:
if cookie:
for cookie_dict in cookie_to_cookies(cookie):
driver.add_cookie(cookie_dict)
result_url, detections = await process_url(driver, url)
return result_url, merge_technologies(detections)
async def analyze_many(self, urls, cookie=None, on_result=None, on_error=None):
urls = [url for url in urls if url]
if not urls:
return {}
worker_count = min(self.workers, self.MAX_BROWSER_WORKERS, len(urls))
await self.ensure_pool(worker_count)
queue = asyncio.Queue()
results = {}
for url in urls:
queue.put_nowait(url)
async def worker():
while True:
try:
url = queue.get_nowait()
except asyncio.QueueEmpty:
break
result_url = url
technologies = {}
error = None
try:
result_url, technologies = await self.analyze_url(url, cookie)
except Exception as exc:
error = exc
results[result_url] = technologies
if error and on_error:
on_error(result_url, error)
if on_result:
on_result(result_url, technologies)
queue.task_done()
workers = [
asyncio.create_task(worker())
for _ in range(worker_count)
]
try:
await asyncio.gather(*workers)
except asyncio.CancelledError:
for worker_task in workers:
worker_task.cancel()
raise
return results
async def close(self):
if self.pool:
await self.pool.cleanup()
self.pool = None
self.pool_size = 0
class Wappalyzer:
SUPPORTED_SCAN_TYPES = {"fast", "balanced", "full"}
MAX_BROWSER_WORKERS = 3
def __init__(self, scan_type="full", workers=1, cookie=None, timeout=30):
scan_type = scan_type.lower()
@@ -33,9 +160,9 @@ class Wappalyzer:
self.workers = workers
self.cookie = cookie
self.timeout = timeout
self._driver_pool = None
self._driver_pool_size = 0
self._closed = False
self._runner = None
self._full_backend = None
self._lock = threading.RLock()
def __enter__(self):
@@ -51,12 +178,18 @@ class Wappalyzer:
return
self._closed = True
driver_pool = self._driver_pool
self._driver_pool = None
self._driver_pool_size = 0
runner = self._runner
backend = self._full_backend
self._runner = None
self._full_backend = None
if driver_pool:
driver_pool.cleanup()
if runner and backend:
try:
runner.run(backend.close())
finally:
runner.close()
elif runner:
runner.close()
def analyze(self, url, cookie=None):
result_url, technologies = self._analyze_url(url, cookie)
@@ -69,143 +202,89 @@ class Wappalyzer:
if not urls:
return {}
worker_count = self._worker_count(len(urls))
self._check_open()
if self.scan_type == "full":
self._ensure_driver_pool(worker_count)
return self._full_runner().run(
self._full_backend.analyze_many(
urls,
cookie=self._effective_cookie(cookie),
on_result=on_result,
on_error=on_error,
)
)
url_queue = Queue()
result_queue = Queue()
stop_event = threading.Event()
results = {}
for url in urls:
url_queue.put(url)
def worker():
while not stop_event.is_set():
try:
original_url = url_queue.get_nowait()
except Empty:
break
result_url = original_url
technologies = {}
error = None
try:
result_url, technologies = self._analyze_url(original_url, cookie)
except Exception as exc:
error = exc
finally:
result_queue.put((result_url, technologies or {}, error))
url_queue.task_done()
threads = [
threading.Thread(target=worker)
for _ in range(worker_count)
]
interrupted = False
processed = 0
for thread in threads:
thread.start()
try:
while processed < len(urls):
try:
result_url, technologies, error = result_queue.get(timeout=0.1)
except Empty:
if all(not thread.is_alive() for thread in threads):
break
continue
processed += 1
results[result_url] = technologies
if error and on_error:
on_error(result_url, error)
if on_result:
on_result(result_url, technologies)
result_queue.task_done()
except KeyboardInterrupt:
interrupted = True
stop_event.set()
finally:
for thread in threads:
thread.join()
while not result_queue.empty():
result_url, technologies, error = result_queue.get()
results[result_url] = technologies
if error and on_error:
on_error(result_url, error)
if on_result:
on_result(result_url, technologies)
result_queue.task_done()
if interrupted:
raise KeyboardInterrupt
return results
return self._analyze_many_http(
urls,
cookie=self._effective_cookie(cookie),
on_result=on_result,
on_error=on_error,
)
def _check_open(self):
if self._closed:
raise RuntimeError("Wappalyzer scanner is closed")
def _worker_count(self, url_count):
if self.scan_type == "full":
return min(self.workers, self.MAX_BROWSER_WORKERS, url_count)
return min(self.workers, url_count)
def _browser_worker_count(self):
return min(self.workers, self.MAX_BROWSER_WORKERS)
def _ensure_driver_pool(self, size):
with self._lock:
self._check_open()
if self._driver_pool:
if size > self._driver_pool_size:
self._driver_pool.grow_to(size)
self._driver_pool_size = size
return self._driver_pool
self._driver_pool = DriverPool(size=size, timeout=self.timeout)
self._driver_pool_size = size
return self._driver_pool
def _effective_cookie(self, cookie):
return self.cookie if cookie is None else cookie
def _full_runner(self):
with self._lock:
self._check_open()
if not self._runner:
self._runner = _LoopRunner()
self._full_backend = _FullScanBackend(
workers=self.workers,
timeout=self.timeout,
)
return self._runner
def _analyze_url(self, url, cookie=None):
self._check_open()
cookie = self._effective_cookie(cookie)
if self.scan_type == "full":
return self._analyze_full_url(url, self._effective_cookie(cookie))
return self._full_runner().run(
self._full_backend.analyze_url(url, cookie=cookie)
)
return url, http_scan(url, self.scan_type, self._effective_cookie(cookie))
return url, http_scan(url, self.scan_type, cookie)
def _analyze_full_url(self, url, cookie=None):
driver_pool = self._ensure_driver_pool(self._browser_worker_count())
def _analyze_many_http(self, urls, cookie=None, on_result=None, on_error=None):
worker_count = min(self.workers, len(urls))
results = {}
with driver_pool.get_driver() as driver:
if cookie:
for cookie_dict in cookie_to_cookies(cookie):
driver.add_cookie(cookie_dict)
def scan(url):
return url, http_scan(url, self.scan_type, cookie)
result_url, detections = process_url(driver, url)
with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
future_to_url = {
executor.submit(scan, url): url
for url in urls
}
return result_url, merge_technologies(detections)
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
result_url = url
technologies = {}
error = None
try:
result_url, technologies = future.result()
except Exception as exc:
error = exc
results[result_url] = technologies
if error and on_error:
on_error(result_url, error)
if on_result:
on_result(result_url, technologies)
return results
Scanner = Wappalyzer