mirror of
https://github.com/s0md3v/wappalyzer-next.git
synced 2026-06-19 07:35:19 +00:00
2.0.0
This commit is contained in:
+89
-4
@@ -10,6 +10,7 @@ from pathlib import Path
|
||||
URL = "https://addons.mozilla.org/firefox/downloads/latest/wappalyzer/platform:2/wappalyzer.xpi"
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
DATA_DIR = REPO_ROOT / "wappalyzer" / "data"
|
||||
EXTENSION_ARCHIVE = DATA_DIR / "wappalyzer-extension.zip"
|
||||
|
||||
PROMPT_BLOCK = re.compile(
|
||||
r"^[ \t]*const current = await get(?:Cached)?Option\('version'\)\n"
|
||||
@@ -31,6 +32,93 @@ def patch_index_js(content):
|
||||
return content
|
||||
|
||||
|
||||
def patch_manifest_for_chromium(manifest):
    """Return a copy of ``manifest`` adjusted for loading in Chromium.

    The input mapping is never modified. The copy drops the top-level
    ``browser_specific_settings`` and ``applications`` keys and, when the
    ``background`` section is a dict that declares a ``service_worker``,
    also drops ``background.scripts``.

    Args:
        manifest: Parsed ``manifest.json`` content (JSON-serialisable).

    Returns:
        A new, independent manifest object with the adjustments applied.
    """
    # A JSON round trip gives a deep copy, so nested sections of the
    # caller's manifest are left untouched by the edits below.
    patched = json.loads(json.dumps(manifest))

    for unwanted_key in ("browser_specific_settings", "applications"):
        patched.pop(unwanted_key, None)

    background_section = patched.get("background")
    declares_service_worker = (
        isinstance(background_section, dict)
        and background_section.get("service_worker")
    )

    # Only strip the script list when a service worker is available to
    # take its place; otherwise the background section is left as-is.
    if declares_service_worker:
        background_section.pop("scripts", None)

    return patched
|
||||
|
||||
|
||||
def _dict_or_empty(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _string_set(value):
    """Return the string entries of a list-like ``value`` as a set."""
    if not isinstance(value, (list, tuple, set, frozenset)):
        return set()

    return {item for item in value if isinstance(item, str)}


def validate_manifest(manifest):
    """Verify that ``manifest`` satisfies the Chromium checks below.

    All checks are evaluated and every failure is collected, so one error
    reports everything that is wrong. Sections with an unexpected JSON
    type (for example ``"action": null``) are treated as missing, so they
    produce a validation message rather than an unrelated
    ``AttributeError``/``TypeError``.

    Checks performed:
        * ``manifest_version`` equals 3
        * ``action.default_popup`` equals ``html/popup.html``
        * ``background.service_worker`` is set and ``background.scripts``
          is absent
        * ``permissions`` contains cookies, storage, tabs and webRequest
        * ``host_permissions`` contains ``http://*/*`` and ``https://*/*``
        * ``browser_specific_settings`` and ``applications`` are absent

    Args:
        manifest: Parsed ``manifest.json`` content.

    Raises:
        RuntimeError: If the manifest is not a JSON object or any check
            fails; the message joins the individual failures with "; ".
    """
    if not isinstance(manifest, dict):
        raise RuntimeError(
            "Invalid Chromium extension manifest: manifest must be a JSON object"
        )

    errors = []

    if manifest.get("manifest_version") != 3:
        errors.append("manifest_version must be 3")

    action = _dict_or_empty(manifest.get("action"))
    if action.get("default_popup") != "html/popup.html":
        errors.append("action.default_popup must be html/popup.html")

    background = _dict_or_empty(manifest.get("background"))
    if not background.get("service_worker"):
        errors.append("background.service_worker is required")

    if "scripts" in background:
        errors.append("background.scripts must be removed for Chromium MV3")

    permissions = _string_set(manifest.get("permissions"))
    host_permissions = _string_set(manifest.get("host_permissions"))

    errors.extend(
        f"missing permission: {permission}"
        for permission in ("cookies", "storage", "tabs", "webRequest")
        if permission not in permissions
    )
    errors.extend(
        f"missing host permission: {host_permission}"
        for host_permission in ("http://*/*", "https://*/*")
        if host_permission not in host_permissions
    )

    if "browser_specific_settings" in manifest:
        errors.append("browser_specific_settings must be removed")

    if "applications" in manifest:
        errors.append("applications must be removed")

    if errors:
        raise RuntimeError("Invalid Chromium extension manifest: " + "; ".join(errors))
|
||||
|
||||
|
||||
def validate_extension_tree(extension_dir, require_technologies=False):
    """Check that an unpacked extension directory has the expected layout.

    Args:
        extension_dir: ``pathlib.Path`` of the unpacked extension root.
        require_technologies: When true, additionally require at least one
            ``*.json`` file directly inside ``technologies/``.

    Raises:
        RuntimeError: If a required file is missing, if fingerprint files
            are required but absent, or if ``validate_manifest`` rejects
            the manifest.
    """
    for relative_path in (
        "manifest.json",
        "html/popup.html",
        "js/background.js",
        "js/index.js",
        "js/content.js",
    ):
        candidate = extension_dir / relative_path

        if not candidate.is_file():
            raise RuntimeError(f"Missing extension file: {relative_path}")

    if require_technologies:
        fingerprint_files = (extension_dir / "technologies").glob("*.json")

        # One match is enough; the generator is not consumed further.
        if next(fingerprint_files, None) is None:
            raise RuntimeError("Missing extension technology fingerprint files")

    manifest_text = (extension_dir / "manifest.json").read_text(encoding="utf-8")
    validate_manifest(json.loads(manifest_text))
|
||||
|
||||
|
||||
def write_chromium_extension_archive(extension_dir, archive_path):
    """Patch the extension manifest in place and zip the extension tree.

    Steps, in order:
        1. Rewrite ``extension_dir/manifest.json`` with the output of
           ``patch_manifest_for_chromium`` (compact JSON separators).
        2. Validate the tree, requiring technology fingerprint files.
        3. Write every regular file under ``extension_dir`` to a deflated
           zip at ``archive_path``, using paths relative to the root.

    Args:
        extension_dir: ``pathlib.Path`` of the unpacked extension root.
        archive_path: Destination path of the zip archive.

    Raises:
        RuntimeError: Propagated from ``validate_extension_tree`` when the
            patched tree is not valid.
    """
    manifest_path = extension_dir / "manifest.json"

    source_manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    chromium_manifest = patch_manifest_for_chromium(source_manifest)
    compact_json = json.dumps(chromium_manifest, separators=(",", ":"))
    manifest_path.write_text(compact_json, encoding="utf-8")

    # Validation runs after the rewrite so it sees the patched manifest.
    validate_extension_tree(extension_dir, require_technologies=True)

    with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as archive:
        # Sorted traversal keeps the entry order stable between runs.
        for entry in sorted(extension_dir.rglob("*")):
            if not entry.is_file():
                continue

            archive.write(entry, entry.relative_to(extension_dir))
|
||||
|
||||
|
||||
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="wappalyzer-update-") as tempdir:
|
||||
@@ -60,7 +148,4 @@ with tempfile.TemporaryDirectory(prefix="wappalyzer-update-") as tempdir:
|
||||
shutil.copy2(extract_dir / "groups.json", DATA_DIR / "groups.json")
|
||||
shutil.copy2(extract_dir / "categories.json", DATA_DIR / "categories.json")
|
||||
|
||||
with zipfile.ZipFile(DATA_DIR / "wappalyzer.xpi", "w", zipfile.ZIP_DEFLATED) as archive:
|
||||
for path in sorted(extract_dir.rglob("*")):
|
||||
if path.is_file():
|
||||
archive.write(path, path.relative_to(extract_dir))
|
||||
write_chromium_extension_archive(extract_dir, EXTENSION_ARCHIVE)
|
||||
|
||||
+5
-9
@@ -1,15 +1,11 @@
|
||||
FROM python:3.12-slim
|
||||
|
||||
RUN apt-get update && apt-get install -y \
|
||||
firefox-esr \
|
||||
wget \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.36.0/geckodriver-v0.36.0-linux64.tar.gz \
|
||||
&& tar -xvzf geckodriver-v0.36.0-linux64.tar.gz \
|
||||
&& mv geckodriver /usr/local/bin/ \
|
||||
&& rm geckodriver-v0.36.0-linux64.tar.gz
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
|
||||
RUN pip install --no-cache-dir wappalyzer
|
||||
RUN pip install --no-cache-dir . \
|
||||
&& python -m playwright install --with-deps chromium
|
||||
|
||||
ENTRYPOINT ["wappalyzer"]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Wappalyzer Next
|
||||
|
||||
This project is a command line tool and python library that uses [Wappalyzer](https://www.wappalyzer.com/) extension (and its fingerprints) to detect technologies. Other projects that emerged after the discontinuation of the official open-source project are using outdated fingerprints and lack accuracy when used on dynamic web apps. This project bypasses those limitations.
|
||||
This project is a command line tool and python library that uses the [Wappalyzer](https://www.wappalyzer.com/) browser extension and its fingerprints to detect technologies. Other projects that emerged after the discontinuation of the official open-source project are using outdated fingerprints and lack accuracy on dynamic web apps. This project bypasses those limitations by running the extension in Chromium through Playwright.
|
||||
|
||||

|
||||
|
||||
@@ -11,46 +11,33 @@ This project is a command line tool and python library that uses [Wappalyzer](ht
|
||||
|
||||
## Installation
|
||||
|
||||
Before installing wappalyzer, you will need to install [Firefox](https://www.mozilla.org/en-US/firefox/windows/) and [geckodriver](https://github.com/mozilla/geckodriver/releases). Below are detailed steps for setting up geckodriver but you may use google/youtube for help.
|
||||
<details>
|
||||
<summary>Setting up geckodriver</summary>
|
||||
After installing the Python package, install Playwright's Chromium browser:
|
||||
|
||||
### Step 1: Download GeckoDriver
|
||||
1. Visit the official GeckoDriver releases page on GitHub:
|
||||
[https://github.com/mozilla/geckodriver/releases](https://github.com/mozilla/geckodriver/releases)
|
||||
2. Download the version compatible with your system:
|
||||
- For Windows: `geckodriver-vX.XX.X-win64.zip`
|
||||
- For macOS: `geckodriver-vX.XX.X-macos.tar.gz`
|
||||
- For Linux: `geckodriver-vX.XX.X-linux64.tar.gz`
|
||||
3. Extract the downloaded file to a folder of your choice.
|
||||
```bash
|
||||
python -m playwright install chromium
|
||||
```
|
||||
|
||||
### Step 2: Add GeckoDriver to the System Path
|
||||
To ensure Selenium can locate the GeckoDriver executable:
|
||||
- **Windows**:
|
||||
1. Move the `geckodriver.exe` to a directory (e.g., `C:\WebDrivers\`).
|
||||
2. Add this directory to the system's PATH:
|
||||
- Open **Environment Variables**.
|
||||
- Under **System Variables**, find and select the `Path` variable, then click **Edit**.
|
||||
- Click **New** and enter the directory path where `geckodriver.exe` is stored.
|
||||
- Click **OK** to save.
|
||||
- **macOS/Linux**:
|
||||
1. Move the `geckodriver` file to `/usr/local/bin/` or another directory in your PATH.
|
||||
2. Use the following command in the terminal:
|
||||
```bash
|
||||
sudo mv geckodriver /usr/local/bin/
|
||||
```
|
||||
Ensure `/usr/local/bin/` is in your PATH.
|
||||
</details>
|
||||
In minimal Linux containers, install Chromium's system dependencies as well:
|
||||
|
||||
```bash
|
||||
python -m playwright install-deps chromium
|
||||
```
|
||||
|
||||
|
||||
#### Install as a command-line tool
|
||||
```bash
|
||||
pipx install wappalyzer
|
||||
pipx run --spec playwright playwright install chromium
|
||||
```
|
||||
|
||||
#### Install as a library
|
||||
To use it as a library, install it with `pip` inside an isolated environment, e.g. `venv` or `docker`. You may also pass `--break-system-packages` to do a 'regular' install, but this is not recommended.
|
||||
|
||||
```bash
|
||||
pip install wappalyzer
|
||||
python -m playwright install chromium
|
||||
```
|
||||
|
||||
#### Install with docker
|
||||
<details><summary>Steps</summary>
|
||||
|
||||
@@ -62,7 +49,7 @@ cd wappalyzer-next
|
||||
|
||||
2. Build and run with Docker Compose:
|
||||
```bash
|
||||
docker compose up -d
|
||||
docker compose build
|
||||
```
|
||||
|
||||
3. To scan URLs using the Docker container:
|
||||
@@ -71,9 +58,9 @@ docker compose up -d
|
||||
```bash
|
||||
docker compose run --rm wappalyzer -i https://example.com
|
||||
```
|
||||
- Scan Multiple URLs from a file:
|
||||
- Scan multiple URLs from a file:
|
||||
```bash
|
||||
docker compose run --rm wappalyzer -i https://example.com -oJ output.json
|
||||
docker compose run --rm wappalyzer -i urls.txt -w 3 -oJ output.json
|
||||
```
|
||||
</details>
|
||||
|
||||
@@ -81,12 +68,14 @@ docker compose run --rm wappalyzer -i https://example.com -oJ output.json
|
||||
Some common usage examples are given below; refer to the list of all options for more information.
|
||||
|
||||
- Scan a single URL: `wappalyzer -i https://example.com`
|
||||
- Scan multiple URLs from a file: `wappalyzer -i urls.txt -w 10`
|
||||
- Scan multiple URLs from a file: `wappalyzer -i urls.txt -w 3`
|
||||
- Set page-load timeout for full scans: `wappalyzer -i urls.txt -t 15`
|
||||
- Scan with authentication: `wappalyzer -i https://example.com -c "sessionid=abc123; token=xyz789"`
|
||||
- Export results to JSON: `wappalyzer -i https://example.com -oJ results.json`
|
||||
- Export JSON to stdout: `wappalyzer -i https://example.com -oJ`
|
||||
|
||||
When an output flag is used without a file, the report is written to stdout. Status lines, banner text, and errors are written to stderr.
|
||||
|
||||
#### Options
|
||||
|
||||
> Note: For accuracy use 'full' scan type (default). 'fast' and 'balanced' do not use browser emulation.
|
||||
@@ -128,7 +117,7 @@ for url, technologies in results.items():
|
||||
print(f" {name}{version}")
|
||||
```
|
||||
|
||||
The same scanner can also scan one URL at a time without reopening Firefox:
|
||||
The same scanner can also scan one URL at a time without reopening Chromium:
|
||||
|
||||
```python
|
||||
from wappalyzer import Wappalyzer
|
||||
@@ -151,7 +140,7 @@ results = analyze(
|
||||
)
|
||||
```
|
||||
|
||||
Do not call the top-level `analyze()` function in a loop for large jobs. Use `Wappalyzer.analyze_many()` or `Wappalyzer.analyze()` on a reused scanner so Firefox and the Wappalyzer extension are not reloaded for every URL.
|
||||
Do not call the top-level `analyze()` function in a loop for large jobs. Use `Wappalyzer.analyze_many()` or `Wappalyzer.analyze()` on a reused scanner so Chromium and the Wappalyzer extension are not reloaded for every URL.
|
||||
|
||||
#### analyze() Function Parameters
|
||||
|
||||
@@ -171,18 +160,27 @@ Returns a dictionary with the URL as key and detected technologies as value:
|
||||
```json
|
||||
{
|
||||
"https://github.com": {
|
||||
"Amazon S3": {"version": "", "confidence": 100, "categories": ["CDN"], "groups": ["Servers"]},
|
||||
"lit-html": {"version": "1.1.2", "confidence": 100, "categories": ["JavaScript libraries"], "groups": ["Web development"]},
|
||||
"React Router": {"version": "6", "confidence": 100, "categories": ["JavaScript frameworks"], "groups": ["Web development"]},
|
||||
"https://google.com" : {},
|
||||
"https://example.com" : {},
|
||||
}}
|
||||
"Amazon S3": {
|
||||
"version": "",
|
||||
"confidence": 100,
|
||||
"categories": ["CDN"],
|
||||
"groups": ["Servers"]
|
||||
},
|
||||
"React Router": {
|
||||
"version": "6",
|
||||
"confidence": 100,
|
||||
"categories": ["JavaScript frameworks"],
|
||||
"groups": ["Web development"]
|
||||
}
|
||||
},
|
||||
"https://example.com": {}
|
||||
}
|
||||
```
|
||||
|
||||
### FAQ
|
||||
|
||||
#### Why use Firefox instead of Chrome?
|
||||
Firefox extensions are .xpi files which are essentially zip files. This makes it easier to extract data and slightly modify the extension to make this tool work.
|
||||
#### Why Chromium and Playwright?
|
||||
The full scanner runs the Wappalyzer extension in Chromium through Playwright. Chromium extension support in Playwright is direct and does not require geckodriver or Selenium.
|
||||
|
||||
#### What is the difference between 'fast', 'balanced', and 'full' scan types?
|
||||
- **fast**: Sends a single HTTP request to the URL. Doesn't use the extension.
|
||||
|
||||
+1
-3
@@ -1,9 +1,7 @@
|
||||
services:
|
||||
wappalyzer:
|
||||
build: .
|
||||
shm_size: '2gb'
|
||||
volumes:
|
||||
- .:/app
|
||||
working_dir: /app
|
||||
firefox:
|
||||
image: selenium/standalone-firefox:latest
|
||||
shm_size: '2gb'
|
||||
|
||||
@@ -10,7 +10,7 @@ with io.open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
|
||||
|
||||
setup(
|
||||
name='wappalyzer',
|
||||
version='2.0.0rc1',
|
||||
version='2.0.0',
|
||||
description='Wappalyzer-based tech stack detection library',
|
||||
long_description=desc,
|
||||
long_description_content_type='text/markdown',
|
||||
@@ -25,7 +25,7 @@ setup(
|
||||
'requests',
|
||||
'urllib3',
|
||||
'huepy',
|
||||
'selenium',
|
||||
'playwright',
|
||||
'tldextract',
|
||||
'beautifulsoup4',
|
||||
'dnspython'
|
||||
|
||||
+589
-856
File diff suppressed because it is too large
Load Diff
@@ -21,4 +21,4 @@ groups_db = {}
|
||||
with open(data_dir + '/groups.json', 'r') as f:
|
||||
groups_db = json.load(f)
|
||||
|
||||
extension_path = data_dir + '/wappalyzer.xpi'
|
||||
extension_path = data_dir + '/wappalyzer-extension.zip'
|
||||
|
||||
@@ -7,7 +7,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
def get_response(url, cookie=None, **kwargs):
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
"Accept-Encoding": "deflate",
|
||||
|
||||
Binary file not shown.
+205
-126
@@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import threading
|
||||
from queue import Empty, Queue
|
||||
|
||||
from wappalyzer.browser.analyzer import (
|
||||
DriverPool,
|
||||
@@ -10,9 +11,135 @@ from wappalyzer.browser.analyzer import (
|
||||
from wappalyzer.core.analyzer import http_scan
|
||||
|
||||
|
||||
class _LoopRunner:
    """Run coroutines on a private event loop hosted by a daemon thread.

    Synchronous callers hand coroutines to ``run()`` and block until the
    result is available; the loop itself keeps running in the background
    thread between calls until ``close()`` is invoked.
    """

    def __init__(self):
        """Create the event loop and start the thread that drives it."""
        self.loop = asyncio.new_event_loop()
        self.thread = threading.Thread(target=self._run, daemon=True)
        self.thread.start()

    def _run(self):
        """Thread target: bind the loop to this thread and run until stopped."""
        asyncio.set_event_loop(self.loop)
        self.loop.run_forever()

    def run(self, coro):
        """Execute ``coro`` on the background loop and return its result.

        Blocks the calling thread until the coroutine finishes. Exceptions
        raised by the coroutine propagate to the caller.

        Raises:
            KeyboardInterrupt: Re-raised after requesting cancellation of
                the in-flight coroutine, so Ctrl-C does not leave it
                running on the loop.
        """
        future = asyncio.run_coroutine_threadsafe(coro, self.loop)

        try:
            return future.result()
        except KeyboardInterrupt:
            future.cancel()
            raise

    def close(self):
        """Stop the loop, wait for its thread, and release loop resources.

        Safe to call more than once; later calls are no-ops.
        """
        if self.loop.is_closed():
            return

        # stop() must be scheduled from outside via the thread-safe API
        # because the loop is running in another thread.
        self.loop.call_soon_threadsafe(self.loop.stop)
        self.thread.join()

        # The loop is no longer running once the thread has exited, so it
        # can be closed here; without this its resources are never freed.
        self.loop.close()
|
||||
|
||||
|
||||
class _FullScanBackend:
    """Asynchronous backend that performs scans through a ``DriverPool``.

    The pool is created lazily on first use and reused for later scans
    until ``close()`` is awaited.
    """

    # Upper bound on concurrent workers started by analyze_many().
    MAX_BROWSER_WORKERS = 3

    def __init__(self, workers=1, timeout=30):
        """Store the settings; no pool is created until a scan is requested.

        Args:
            workers: Requested concurrency for ``analyze_many``.
            timeout: Value forwarded to ``DriverPool`` when it is created.
        """
        self.workers = workers
        self.timeout = timeout
        self.pool = None
        self.pool_size = 0

    async def ensure_pool(self, size):
        """Make sure a started pool with at least ``size`` drivers exists.

        An existing pool is only ever grown, never shrunk. A pool that
        fails to start is cleaned up and the error is re-raised, leaving
        ``self.pool`` unset.
        """
        if self.pool:
            if size > self.pool_size:
                await self.pool.grow_to(size)
                self.pool_size = size

            return

        pool = DriverPool(size=size, timeout=self.timeout)

        try:
            await pool.start()
        except Exception:
            await pool.cleanup()
            raise

        # Publish the pool only after it has started successfully.
        self.pool = pool
        self.pool_size = size

    async def analyze_url(self, url, cookie=None):
        """Scan one URL with a driver borrowed from the pool.

        Args:
            url: URL to scan.
            cookie: Optional cookie string; when given it is expanded with
                ``cookie_to_cookies`` and each entry is added to the driver.

        Returns:
            ``(result_url, technologies)`` where ``result_url`` is the URL
            reported by ``process_url`` and ``technologies`` is the output
            of ``merge_technologies``.
        """
        await self.ensure_pool(1)

        async with self.pool.get_driver() as driver:
            if cookie:
                for cookie_dict in cookie_to_cookies(cookie):
                    # NOTE(review): add_cookie is called without await, as
                    # in the original; confirm it is a synchronous method.
                    driver.add_cookie(cookie_dict)

            result_url, detections = await process_url(driver, url)

        return result_url, merge_technologies(detections)

    async def analyze_many(self, urls, cookie=None, on_result=None, on_error=None):
        """Scan several URLs concurrently and collect the results.

        Falsy entries in ``urls`` are skipped. A URL whose scan raises is
        recorded with an empty result; ``on_error(url, exc)`` is called
        for it, and ``on_result(url, technologies)`` is called for every
        URL, failed or not.

        If a worker itself fails (for example a callback raises) or this
        coroutine is cancelled, all remaining workers are cancelled and
        awaited before the exception propagates, so no worker keeps
        scanning in the background.

        Returns:
            Dict mapping each result URL to its technologies dict.
        """
        urls = [url for url in urls if url]

        if not urls:
            return {}

        worker_count = min(self.workers, self.MAX_BROWSER_WORKERS, len(urls))
        await self.ensure_pool(worker_count)

        queue = asyncio.Queue()
        results = {}

        for url in urls:
            queue.put_nowait(url)

        async def worker():
            """Drain the shared queue until it is empty."""
            while True:
                try:
                    url = queue.get_nowait()
                except asyncio.QueueEmpty:
                    break

                result_url = url
                technologies = {}
                error = None

                try:
                    result_url, technologies = await self.analyze_url(url, cookie)
                except Exception as exc:
                    # A failed scan must not stop the remaining URLs.
                    error = exc

                results[result_url] = technologies

                if error and on_error:
                    on_error(result_url, error)

                if on_result:
                    on_result(result_url, technologies)

                queue.task_done()

        workers = [
            asyncio.create_task(worker())
            for _ in range(worker_count)
        ]

        try:
            await asyncio.gather(*workers)
        except BaseException:
            # gather() does not cancel sibling tasks when one of them
            # fails, so stop them explicitly and wait for them to unwind.
            for worker_task in workers:
                worker_task.cancel()

            await asyncio.gather(*workers, return_exceptions=True)
            raise

        return results

    async def close(self):
        """Clean up the pool, if any, and reset the backend to its idle state."""
        if self.pool:
            await self.pool.cleanup()
            self.pool = None
            self.pool_size = 0
|
||||
|
||||
|
||||
class Wappalyzer:
|
||||
SUPPORTED_SCAN_TYPES = {"fast", "balanced", "full"}
|
||||
MAX_BROWSER_WORKERS = 3
|
||||
|
||||
def __init__(self, scan_type="full", workers=1, cookie=None, timeout=30):
|
||||
scan_type = scan_type.lower()
|
||||
@@ -33,9 +160,9 @@ class Wappalyzer:
|
||||
self.workers = workers
|
||||
self.cookie = cookie
|
||||
self.timeout = timeout
|
||||
self._driver_pool = None
|
||||
self._driver_pool_size = 0
|
||||
self._closed = False
|
||||
self._runner = None
|
||||
self._full_backend = None
|
||||
self._lock = threading.RLock()
|
||||
|
||||
def __enter__(self):
|
||||
@@ -51,12 +178,18 @@ class Wappalyzer:
|
||||
return
|
||||
|
||||
self._closed = True
|
||||
driver_pool = self._driver_pool
|
||||
self._driver_pool = None
|
||||
self._driver_pool_size = 0
|
||||
runner = self._runner
|
||||
backend = self._full_backend
|
||||
self._runner = None
|
||||
self._full_backend = None
|
||||
|
||||
if driver_pool:
|
||||
driver_pool.cleanup()
|
||||
if runner and backend:
|
||||
try:
|
||||
runner.run(backend.close())
|
||||
finally:
|
||||
runner.close()
|
||||
elif runner:
|
||||
runner.close()
|
||||
|
||||
def analyze(self, url, cookie=None):
|
||||
result_url, technologies = self._analyze_url(url, cookie)
|
||||
@@ -69,143 +202,89 @@ class Wappalyzer:
|
||||
if not urls:
|
||||
return {}
|
||||
|
||||
worker_count = self._worker_count(len(urls))
|
||||
self._check_open()
|
||||
|
||||
if self.scan_type == "full":
|
||||
self._ensure_driver_pool(worker_count)
|
||||
return self._full_runner().run(
|
||||
self._full_backend.analyze_many(
|
||||
urls,
|
||||
cookie=self._effective_cookie(cookie),
|
||||
on_result=on_result,
|
||||
on_error=on_error,
|
||||
)
|
||||
)
|
||||
|
||||
url_queue = Queue()
|
||||
result_queue = Queue()
|
||||
stop_event = threading.Event()
|
||||
results = {}
|
||||
|
||||
for url in urls:
|
||||
url_queue.put(url)
|
||||
|
||||
def worker():
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
original_url = url_queue.get_nowait()
|
||||
except Empty:
|
||||
break
|
||||
|
||||
result_url = original_url
|
||||
technologies = {}
|
||||
error = None
|
||||
|
||||
try:
|
||||
result_url, technologies = self._analyze_url(original_url, cookie)
|
||||
except Exception as exc:
|
||||
error = exc
|
||||
finally:
|
||||
result_queue.put((result_url, technologies or {}, error))
|
||||
url_queue.task_done()
|
||||
|
||||
threads = [
|
||||
threading.Thread(target=worker)
|
||||
for _ in range(worker_count)
|
||||
]
|
||||
interrupted = False
|
||||
processed = 0
|
||||
|
||||
for thread in threads:
|
||||
thread.start()
|
||||
|
||||
try:
|
||||
while processed < len(urls):
|
||||
try:
|
||||
result_url, technologies, error = result_queue.get(timeout=0.1)
|
||||
except Empty:
|
||||
if all(not thread.is_alive() for thread in threads):
|
||||
break
|
||||
|
||||
continue
|
||||
|
||||
processed += 1
|
||||
results[result_url] = technologies
|
||||
|
||||
if error and on_error:
|
||||
on_error(result_url, error)
|
||||
|
||||
if on_result:
|
||||
on_result(result_url, technologies)
|
||||
|
||||
result_queue.task_done()
|
||||
except KeyboardInterrupt:
|
||||
interrupted = True
|
||||
stop_event.set()
|
||||
finally:
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
while not result_queue.empty():
|
||||
result_url, technologies, error = result_queue.get()
|
||||
results[result_url] = technologies
|
||||
|
||||
if error and on_error:
|
||||
on_error(result_url, error)
|
||||
|
||||
if on_result:
|
||||
on_result(result_url, technologies)
|
||||
|
||||
result_queue.task_done()
|
||||
|
||||
if interrupted:
|
||||
raise KeyboardInterrupt
|
||||
|
||||
return results
|
||||
return self._analyze_many_http(
|
||||
urls,
|
||||
cookie=self._effective_cookie(cookie),
|
||||
on_result=on_result,
|
||||
on_error=on_error,
|
||||
)
|
||||
|
||||
def _check_open(self):
|
||||
if self._closed:
|
||||
raise RuntimeError("Wappalyzer scanner is closed")
|
||||
|
||||
def _worker_count(self, url_count):
|
||||
if self.scan_type == "full":
|
||||
return min(self.workers, self.MAX_BROWSER_WORKERS, url_count)
|
||||
|
||||
return min(self.workers, url_count)
|
||||
|
||||
def _browser_worker_count(self):
|
||||
return min(self.workers, self.MAX_BROWSER_WORKERS)
|
||||
|
||||
def _ensure_driver_pool(self, size):
|
||||
with self._lock:
|
||||
self._check_open()
|
||||
|
||||
if self._driver_pool:
|
||||
if size > self._driver_pool_size:
|
||||
self._driver_pool.grow_to(size)
|
||||
self._driver_pool_size = size
|
||||
|
||||
return self._driver_pool
|
||||
|
||||
self._driver_pool = DriverPool(size=size, timeout=self.timeout)
|
||||
self._driver_pool_size = size
|
||||
|
||||
return self._driver_pool
|
||||
|
||||
def _effective_cookie(self, cookie):
|
||||
return self.cookie if cookie is None else cookie
|
||||
|
||||
def _full_runner(self):
|
||||
with self._lock:
|
||||
self._check_open()
|
||||
|
||||
if not self._runner:
|
||||
self._runner = _LoopRunner()
|
||||
self._full_backend = _FullScanBackend(
|
||||
workers=self.workers,
|
||||
timeout=self.timeout,
|
||||
)
|
||||
|
||||
return self._runner
|
||||
|
||||
def _analyze_url(self, url, cookie=None):
|
||||
self._check_open()
|
||||
cookie = self._effective_cookie(cookie)
|
||||
|
||||
if self.scan_type == "full":
|
||||
return self._analyze_full_url(url, self._effective_cookie(cookie))
|
||||
return self._full_runner().run(
|
||||
self._full_backend.analyze_url(url, cookie=cookie)
|
||||
)
|
||||
|
||||
return url, http_scan(url, self.scan_type, self._effective_cookie(cookie))
|
||||
return url, http_scan(url, self.scan_type, cookie)
|
||||
|
||||
def _analyze_full_url(self, url, cookie=None):
|
||||
driver_pool = self._ensure_driver_pool(self._browser_worker_count())
|
||||
def _analyze_many_http(self, urls, cookie=None, on_result=None, on_error=None):
|
||||
worker_count = min(self.workers, len(urls))
|
||||
results = {}
|
||||
|
||||
with driver_pool.get_driver() as driver:
|
||||
if cookie:
|
||||
for cookie_dict in cookie_to_cookies(cookie):
|
||||
driver.add_cookie(cookie_dict)
|
||||
def scan(url):
|
||||
return url, http_scan(url, self.scan_type, cookie)
|
||||
|
||||
result_url, detections = process_url(driver, url)
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
|
||||
future_to_url = {
|
||||
executor.submit(scan, url): url
|
||||
for url in urls
|
||||
}
|
||||
|
||||
return result_url, merge_technologies(detections)
|
||||
for future in concurrent.futures.as_completed(future_to_url):
|
||||
url = future_to_url[future]
|
||||
result_url = url
|
||||
technologies = {}
|
||||
error = None
|
||||
|
||||
try:
|
||||
result_url, technologies = future.result()
|
||||
except Exception as exc:
|
||||
error = exc
|
||||
|
||||
results[result_url] = technologies
|
||||
|
||||
if error and on_error:
|
||||
on_error(result_url, error)
|
||||
|
||||
if on_result:
|
||||
on_result(result_url, technologies)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
Scanner = Wappalyzer
|
||||
|
||||
Reference in New Issue
Block a user