From a25a9975aea6a8b601dd8adafe372191bf06ecd7 Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 3 Oct 2025 13:55:43 +0200 Subject: [PATCH] Initialer Push von Martin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .claude/settings.local.json | 12 + .env.example | 6 + .gitignore | 25 + Dockerfile | 45 ++ README.md | 58 +++ composer.json | 16 + config/nginx/default.conf | 31 ++ docker-compose.yml | 57 +++ init.sql | 66 +++ src/api.php | 128 +++++ src/classes/Crawler.php | 286 +++++++++++ src/classes/Database.php | 32 ++ src/composer.json | 16 + src/composer.lock | 988 ++++++++++++++++++++++++++++++++++++ src/crawler-worker.php | 40 ++ src/index.php | 479 +++++++++++++++++ start.sh | 7 + 17 files changed, 2292 insertions(+) create mode 100644 .claude/settings.local.json create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 composer.json create mode 100644 config/nginx/default.conf create mode 100644 docker-compose.yml create mode 100644 init.sql create mode 100644 src/api.php create mode 100644 src/classes/Crawler.php create mode 100644 src/classes/Database.php create mode 100644 src/composer.json create mode 100644 src/composer.lock create mode 100644 src/crawler-worker.php create mode 100644 src/index.php create mode 100644 start.sh diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..96696d7 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,12 @@ +{ + "permissions": { + "allow": [ + "Bash(docker-compose up:*)", + "Bash(docker:*)", + "Bash(docker-compose:*)", + "Bash(git add:*)" + ], + "deny": [], + "ask": [] + } +} \ No newline at end of file diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a09be7d --- /dev/null +++ b/.env.example @@ -0,0 +1,6 @@ +# Database Configuration +DB_HOST=mariadb +DB_NAME=app_database +DB_USER=app_user +DB_PASSWORD=app_password +DB_ROOT_PASSWORD=root_password diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e7e4f35 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# IDE +.idea/ +.vscode/ + +# Dependencies +vendor/ +node_modules/ + +# Environment +.env +.env.local + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Temporary files +*.tmp +*.cache + +# Docker +docker-compose.override.yml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9cbe72f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,45 @@ +FROM php:8.3-fpm + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + nginx \ + libpng-dev \ + libjpeg-dev \ + libfreetype6-dev \ + libzip-dev \ + zip \ + unzip \ + git \ + curl \ + && docker-php-ext-configure gd --with-freetype --with-jpeg \ + && docker-php-ext-install -j$(nproc) gd pdo pdo_mysql mysqli zip \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install Composer +COPY --from=composer:latest /usr/bin/composer /usr/bin/composer + +# Configure nginx +RUN rm -rf /etc/nginx/sites-enabled/default + +# Configure PHP-FPM +RUN sed -i 's/listen = 127.0.0.1:9000/listen = 9000/g' /usr/local/etc/php-fpm.d/www.conf + +# Set working directory +WORKDIR /var/www/html + +# Copy application files +COPY ./src /var/www/html + +# Set permissions +RUN chown -R www-data:www-data /var/www/html \ + && chmod -R 755 /var/www/html + +# Expose port 80 +EXPOSE 80 + +# Start script +COPY start.sh /start.sh +RUN chmod +x /start.sh + +CMD ["/start.sh"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..4516d23 --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +# PHP Docker Anwendung + +Eine PHP-Anwendung mit MariaDB, die in Docker läuft. + +## Anforderungen + +- Docker +- Docker Compose + +## Installation & Start + +1. Container starten: +```bash +docker-compose up -d +``` + +2. Container stoppen: +```bash +docker-compose down +``` + +3. Container neu bauen: +```bash +docker-compose up -d --build +``` + +## Services + +- **PHP Anwendung**: http://localhost:8080 +- **phpMyAdmin**: http://localhost:8081 +- **MariaDB**: Port 3306 + +## Datenbank Zugangsdaten + +- **Host**: mariadb +- **Datenbank**: app_database +- **Benutzer**: app_user +- **Passwort**: app_password +- **Root Passwort**: root_password + +## Struktur + +``` +. +├── docker-compose.yml # Docker Compose Konfiguration +├── Dockerfile # PHP Container Image +├── start.sh # Container Start-Script +├── init.sql # Datenbank Initialisierung +├── config/ +│ └── nginx/ +│ └── default.conf # Nginx Konfiguration +└── src/ + └── index.php # Hauptanwendung +``` + +## Entwicklung + +Die Anwendungsdateien befinden sich im `src/` Verzeichnis und werden als Volume in den Container gemountet, sodass Änderungen sofort sichtbar sind. diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..a5e4d0c --- /dev/null +++ b/composer.json @@ -0,0 +1,16 @@ +{ + "name": "web-crawler/app", + "description": "Web Crawler Application with Parallel Processing", + "type": "project", + "require": { + "php": "^8.3", + "guzzlehttp/guzzle": "^7.8", + "symfony/dom-crawler": "^7.0", + "symfony/css-selector": "^7.0" + }, + "autoload": { + "psr-4": { + "App\\": "classes/" + } + } +} diff --git a/config/nginx/default.conf b/config/nginx/default.conf new file mode 100644 index 0000000..5340d7a --- /dev/null +++ b/config/nginx/default.conf @@ -0,0 +1,31 @@ +server { + listen 80; + server_name localhost; + root /var/www/html; + index index.php index.html; + + error_log /var/log/nginx/error.log; + access_log /var/log/nginx/access.log; + + location / { + try_files $uri $uri/ /index.php?$query_string; + } + + location ~ \.php$ { + try_files $uri =404; + fastcgi_split_path_info ^(.+\.php)(/.+)$; + fastcgi_pass 127.0.0.1:9000; + fastcgi_index index.php; + include fastcgi_params; + fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; + fastcgi_param PATH_INFO $fastcgi_path_info; + } + + location ~ /\.ht { + deny all; + } + + location ~ /\.git { + deny all; + } +} diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..4fcfb47 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,57 @@ +version: '3.8' + +services: + php: + build: + context: . + dockerfile: Dockerfile + container_name: php_app + ports: + - "8080:80" + volumes: + - ./src:/var/www/html + - ./config/nginx/default.conf:/etc/nginx/conf.d/default.conf + depends_on: + - mariadb + networks: + - app-network + + mariadb: + image: mariadb:11.5 + container_name: mariadb_db + restart: unless-stopped + environment: + MYSQL_ROOT_PASSWORD: root_password + MYSQL_DATABASE: app_database + MYSQL_USER: app_user + MYSQL_PASSWORD: app_password + ports: + - "3307:3306" + volumes: + - mariadb_data:/var/lib/mysql + - ./init.sql:/docker-entrypoint-initdb.d/init.sql + networks: + - app-network + + phpmyadmin: + image: phpmyadmin:latest + container_name: phpmyadmin + restart: unless-stopped + environment: + PMA_HOST: mariadb + PMA_PORT: 3306 + MYSQL_ROOT_PASSWORD: root_password + ports: + - "8081:80" + depends_on: + - mariadb + networks: + - app-network + +networks: + app-network: + driver: bridge + +volumes: + mariadb_data: + driver: local diff --git a/init.sql b/init.sql new file mode 100644 index 0000000..f8719ea --- /dev/null +++ b/init.sql @@ -0,0 +1,66 @@ +-- Database initialization script for Web Crawler + +-- Crawl Jobs Table +CREATE TABLE IF NOT EXISTS crawl_jobs ( + id INT AUTO_INCREMENT PRIMARY KEY, + domain VARCHAR(255) NOT NULL, + status ENUM('pending', 'running', 'completed', 'failed') DEFAULT 'pending', + total_pages INT DEFAULT 0, + total_links INT DEFAULT 0, + started_at TIMESTAMP NULL, + completed_at TIMESTAMP NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + INDEX idx_domain (domain), + INDEX idx_status (status) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- Pages Table +CREATE TABLE IF NOT EXISTS pages ( + id INT AUTO_INCREMENT PRIMARY KEY, + crawl_job_id INT NOT NULL, + url VARCHAR(2048) NOT NULL, + title VARCHAR(500), + status_code INT, + content_type VARCHAR(100), + crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE, + INDEX idx_crawl_job (crawl_job_id), + INDEX idx_url (url(255)), + UNIQUE KEY unique_job_url (crawl_job_id, url(255)) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- Links Table +CREATE TABLE IF NOT EXISTS links ( + id INT AUTO_INCREMENT PRIMARY KEY, + page_id INT NOT NULL, + crawl_job_id INT NOT NULL, + source_url VARCHAR(2048) NOT NULL, + target_url VARCHAR(2048) NOT NULL, + link_text VARCHAR(1000), + is_nofollow BOOLEAN DEFAULT FALSE, + is_internal BOOLEAN DEFAULT TRUE, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE, + FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE, + INDEX idx_page (page_id), + INDEX idx_crawl_job (crawl_job_id), + INDEX idx_source_url (source_url(255)), + INDEX idx_target_url (target_url(255)), + INDEX idx_nofollow (is_nofollow) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- Queue Table for parallel processing +CREATE TABLE IF NOT EXISTS crawl_queue ( + id INT AUTO_INCREMENT PRIMARY KEY, + crawl_job_id INT NOT NULL, + url VARCHAR(2048) NOT NULL, + depth INT DEFAULT 0, + status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending', + retry_count INT DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + processed_at TIMESTAMP NULL, + FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE, + INDEX idx_status (status), + INDEX idx_crawl_job (crawl_job_id), + UNIQUE KEY unique_job_url (crawl_job_id, url(255)) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; diff --git a/src/api.php b/src/api.php new file mode 100644 index 0000000..9d6f074 --- /dev/null +++ b/src/api.php @@ -0,0 +1,128 @@ +prepare("INSERT INTO crawl_jobs (domain, status) VALUES (?, 'pending')"); + $stmt->execute([$domain]); + $jobId = $db->lastInsertId(); + + // Start crawling in background (using exec for async) + $cmd = "php " . __DIR__ . "/crawler-worker.php $jobId > /dev/null 2>&1 &"; + exec($cmd); + + echo json_encode([ + 'success' => true, + 'job_id' => $jobId, + 'message' => 'Crawl job started' + ]); + break; + + case 'status': + $jobId = $_GET['job_id'] ?? 0; + $stmt = $db->prepare("SELECT * FROM crawl_jobs WHERE id = ?"); + $stmt->execute([$jobId]); + $job = $stmt->fetch(); + + if (!$job) { + throw new Exception('Job not found'); + } + + // Get queue statistics + $stmt = $db->prepare(" + SELECT + COUNT(*) as total, + SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending, + SUM(CASE WHEN status = 'processing' THEN 1 ELSE 0 END) as processing, + SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed, + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed + FROM crawl_queue + WHERE crawl_job_id = ? + "); + $stmt->execute([$jobId]); + $queueStats = $stmt->fetch(); + + echo json_encode([ + 'success' => true, + 'job' => $job, + 'queue' => $queueStats + ]); + break; + + case 'jobs': + $stmt = $db->query("SELECT * FROM crawl_jobs ORDER BY created_at DESC LIMIT 50"); + $jobs = $stmt->fetchAll(); + + echo json_encode([ + 'success' => true, + 'jobs' => $jobs + ]); + break; + + case 'pages': + $jobId = $_GET['job_id'] ?? 0; + $stmt = $db->prepare("SELECT * FROM pages WHERE crawl_job_id = ? ORDER BY id DESC LIMIT 1000"); + $stmt->execute([$jobId]); + $pages = $stmt->fetchAll(); + + echo json_encode([ + 'success' => true, + 'pages' => $pages + ]); + break; + + case 'links': + $jobId = $_GET['job_id'] ?? 0; + $stmt = $db->prepare("SELECT * FROM links WHERE crawl_job_id = ? ORDER BY id DESC LIMIT 1000"); + $stmt->execute([$jobId]); + $links = $stmt->fetchAll(); + + echo json_encode([ + 'success' => true, + 'links' => $links + ]); + break; + + case 'delete': + $jobId = $_POST['job_id'] ?? 0; + $stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?"); + $stmt->execute([$jobId]); + + echo json_encode([ + 'success' => true, + 'message' => 'Job deleted' + ]); + break; + + default: + throw new Exception('Invalid action'); + } +} catch (Exception $e) { + http_response_code(400); + echo json_encode([ + 'success' => false, + 'error' => $e->getMessage() + ]); +} diff --git a/src/classes/Crawler.php b/src/classes/Crawler.php new file mode 100644 index 0000000..55d30ec --- /dev/null +++ b/src/classes/Crawler.php @@ -0,0 +1,286 @@ +db = Database::getInstance(); + $this->crawlJobId = $crawlJobId; + $this->client = new Client([ + 'timeout' => 30, + 'verify' => false, + 'headers' => [ + 'User-Agent' => 'WebCrawler/1.0' + ] + ]); + } + + public function start(string $startUrl): void { + $this->baseDomain = strtolower(parse_url($startUrl, PHP_URL_HOST)); + + // Update job status + $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?"); + $stmt->execute([$this->crawlJobId]); + + // Normalize and add start URL to queue + $normalizedStartUrl = $this->normalizeUrl($startUrl); + $this->addToQueue($normalizedStartUrl, 0); + + // Process queue + $this->processQueue(); + + // Update job status + $this->updateJobStats(); + $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'completed', completed_at = NOW() WHERE id = ?"); + $stmt->execute([$this->crawlJobId]); + } + + private function addToQueue(string $url, int $depth): void { + if (isset($this->visited[$url])) { + return; + } + + try { + $stmt = $this->db->prepare( + "INSERT IGNORE INTO crawl_queue (crawl_job_id, url, depth) VALUES (?, ?, ?)" + ); + $stmt->execute([$this->crawlJobId, $url, $depth]); + } catch (\Exception $e) { + // URL already in queue + } + } + + private function processQueue(): void { + while (true) { + // Get pending URLs + $stmt = $this->db->prepare( + "SELECT id, url, depth FROM crawl_queue + WHERE crawl_job_id = ? AND status = 'pending' + LIMIT ?" + ); + $stmt->execute([$this->crawlJobId, $this->concurrency]); + $urls = $stmt->fetchAll(); + + if (empty($urls)) { + break; + } + + $this->crawlBatch($urls); + } + } + + private function crawlBatch(array $urls): void { + $requests = function() use ($urls) { + foreach ($urls as $item) { + // Mark as processing + $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?"); + $stmt->execute([$item['id']]); + + yield function() use ($item) { + return $this->client->getAsync($item['url']); + }; + } + }; + + $pool = new Pool($this->client, $requests(), [ + 'concurrency' => $this->concurrency, + 'fulfilled' => function ($response, $index) use ($urls) { + $item = $urls[$index]; + $this->handleResponse($item, $response); + }, + 'rejected' => function ($reason, $index) use ($urls) { + $item = $urls[$index]; + $this->handleError($item, $reason); + }, + ]); + + $pool->promise()->wait(); + } + + private function handleResponse(array $queueItem, $response): void { + $url = $queueItem['url']; + $depth = $queueItem['depth']; + + $this->visited[$url] = true; + + $statusCode = $response->getStatusCode(); + $contentType = $response->getHeaderLine('Content-Type'); + $body = $response->getBody()->getContents(); + + // Save page + $domCrawler = new DomCrawler($body, $url); + $title = $domCrawler->filter('title')->count() > 0 + ? $domCrawler->filter('title')->text() + : ''; + + $stmt = $this->db->prepare( + "INSERT INTO pages (crawl_job_id, url, title, status_code, content_type) + VALUES (?, ?, ?, ?, ?) + ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code)" + ); + + $stmt->execute([$this->crawlJobId, $url, $title, $statusCode, $contentType]); + $pageId = $this->db->lastInsertId(); + + // If pageId is 0, fetch it manually + if ($pageId == 0) { + $stmt = $this->db->prepare("SELECT id FROM pages WHERE crawl_job_id = ? AND url = ?"); + $stmt->execute([$this->crawlJobId, $url]); + $pageId = $stmt->fetchColumn(); + } + + // Extract and save links + if (str_contains($contentType, 'text/html')) { + echo "Extracting links from: $url (pageId: $pageId)\n"; + $this->extractLinks($domCrawler, $url, $pageId, $depth); + } else { + echo "Skipping link extraction - content type: $contentType\n"; + } + + // Mark as completed + $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'completed', processed_at = NOW() WHERE id = ?"); + $stmt->execute([$queueItem['id']]); + } + + private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void { + $linkCount = 0; + $crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) { + try { + $linkCount++; + $href = $node->attr('href'); + if (!$href || $href === '#') { + return; + } + + // Convert relative URLs to absolute + $targetUrl = $this->makeAbsoluteUrl($href, $sourceUrl); + + // Get link text + $linkText = trim($node->text()); + + // Check nofollow + $rel = $node->attr('rel') ?? ''; + $isNofollow = str_contains($rel, 'nofollow'); + + // Check if internal (same domain, no subdomains) + $targetDomain = strtolower(parse_url($targetUrl, PHP_URL_HOST) ?? ''); + $isInternal = ($targetDomain === $this->baseDomain); + + // Save link + $stmt = $this->db->prepare( + "INSERT INTO links (page_id, crawl_job_id, source_url, target_url, link_text, is_nofollow, is_internal) + VALUES (?, ?, ?, ?, ?, ?, ?)" + ); + $stmt->execute([ + $pageId, + $this->crawlJobId, + $sourceUrl, + $targetUrl, + $linkText, + $isNofollow ? 1 : 0, + $isInternal ? 1 : 0 + ]); + + // Add to queue if internal and not nofollow + if ($isInternal && !$isNofollow && $depth < 50) { + // Normalize URL (remove fragment, trailing slash) + $normalizedUrl = $this->normalizeUrl($targetUrl); + $this->addToQueue($normalizedUrl, $depth + 1); + } + } catch (\Exception $e) { + echo "Error processing link: " . $e->getMessage() . "\n"; + } + }); + echo "Processed $linkCount links from $sourceUrl\n"; + } + + private function makeAbsoluteUrl(string $url, string $base): string { + if (filter_var($url, FILTER_VALIDATE_URL)) { + return $url; + } + + $parts = parse_url($base); + $scheme = $parts['scheme'] ?? 'http'; + $host = $parts['host'] ?? ''; + $path = $parts['path'] ?? '/'; + + if ($url[0] === '/') { + return "$scheme://$host$url"; + } + + $basePath = substr($path, 0, strrpos($path, '/') + 1); + return "$scheme://$host$basePath$url"; + } + + private function handleError(array $queueItem, $reason): void { + $stmt = $this->db->prepare( + "UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?" + ); + $stmt->execute([$queueItem['id']]); + } + + private function updateJobStats(): void { + $stmt = $this->db->prepare( + "UPDATE crawl_jobs SET + total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?), + total_links = (SELECT COUNT(*) FROM links WHERE crawl_job_id = ?) + WHERE id = ?" + ); + $stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]); + } + + private function normalizeUrl(string $url): string { + // Parse URL + $parts = parse_url($url); + + if (!$parts) { + return $url; + } + + // Remove fragment + unset($parts['fragment']); + + // Normalize domain (add www if base domain has it, or remove if base doesn't) + if (isset($parts['host'])) { + // Always convert to lowercase + $parts['host'] = strtolower($parts['host']); + + // Match www pattern with base domain + $baseHasWww = str_starts_with($this->baseDomain, 'www.'); + $urlHasWww = str_starts_with($parts['host'], 'www.'); + + if ($baseHasWww && !$urlHasWww) { + $parts['host'] = 'www.' . $parts['host']; + } elseif (!$baseHasWww && $urlHasWww) { + $parts['host'] = substr($parts['host'], 4); + } + } + + // Normalize path - remove trailing slash except for root + if (isset($parts['path']) && $parts['path'] !== '/') { + $parts['path'] = rtrim($parts['path'], '/'); + } + + // Rebuild URL + $scheme = isset($parts['scheme']) ? $parts['scheme'] . '://' : ''; + $host = $parts['host'] ?? ''; + $port = isset($parts['port']) ? ':' . $parts['port'] : ''; + $path = $parts['path'] ?? '/'; + $query = isset($parts['query']) ? '?' . $parts['query'] : ''; + + return $scheme . $host . $port . $path . $query; + } +} diff --git a/src/classes/Database.php b/src/classes/Database.php new file mode 100644 index 0000000..9b95aba --- /dev/null +++ b/src/classes/Database.php @@ -0,0 +1,32 @@ + PDO::ERRMODE_EXCEPTION, + PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC, + PDO::ATTR_EMULATE_PREPARES => false, + ] + ); + } catch (PDOException $e) { + throw new \Exception("Database connection failed: " . $e->getMessage()); + } + } + return self::$instance; + } +} diff --git a/src/composer.json b/src/composer.json new file mode 100644 index 0000000..a5e4d0c --- /dev/null +++ b/src/composer.json @@ -0,0 +1,16 @@ +{ + "name": "web-crawler/app", + "description": "Web Crawler Application with Parallel Processing", + "type": "project", + "require": { + "php": "^8.3", + "guzzlehttp/guzzle": "^7.8", + "symfony/dom-crawler": "^7.0", + "symfony/css-selector": "^7.0" + }, + "autoload": { + "psr-4": { + "App\\": "classes/" + } + } +} diff --git a/src/composer.lock b/src/composer.lock new file mode 100644 index 0000000..8b9f621 --- /dev/null +++ b/src/composer.lock @@ -0,0 +1,988 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "695ff0f32c1617699df31c134df90401", + "packages": [ + { + "name": "guzzlehttp/guzzle", + "version": "7.10.0", + "source": { + "type": "git", + "url": "https://github.com/guzzle/guzzle.git", + "reference": "b51ac707cfa420b7bfd4e4d5e510ba8008e822b4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/guzzle/zipball/b51ac707cfa420b7bfd4e4d5e510ba8008e822b4", + "reference": "b51ac707cfa420b7bfd4e4d5e510ba8008e822b4", + "shasum": "" + }, + "require": { + "ext-json": "*", + "guzzlehttp/promises": "^2.3", + "guzzlehttp/psr7": "^2.8", + "php": "^7.2.5 || ^8.0", + "psr/http-client": "^1.0", + "symfony/deprecation-contracts": "^2.2 || ^3.0" + }, + "provide": { + "psr/http-client-implementation": "1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.2", + "ext-curl": "*", + "guzzle/client-integration-tests": "3.0.2", + "php-http/message-factory": "^1.1", + "phpunit/phpunit": "^8.5.39 || ^9.6.20", + "psr/log": "^1.1 || ^2.0 || ^3.0" + }, + "suggest": { + "ext-curl": "Required for CURL handler support", + "ext-intl": "Required for Internationalized Domain Name (IDN) support", + "psr/log": "Required for using the Log middleware" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + } + }, + "autoload": { + "files": [ + "src/functions_include.php" + ], + "psr-4": { + "GuzzleHttp\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "Jeremy Lindblom", + "email": "jeremeamia@gmail.com", + "homepage": "https://github.com/jeremeamia" + }, + { + "name": "George Mponos", + "email": "gmponos@gmail.com", + "homepage": "https://github.com/gmponos" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://github.com/sagikazarmark" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + } + ], + "description": "Guzzle is a PHP HTTP client library", + "keywords": [ + "client", + "curl", + "framework", + "http", + "http client", + "psr-18", + "psr-7", + "rest", + "web service" + ], + "support": { + "issues": "https://github.com/guzzle/guzzle/issues", + "source": "https://github.com/guzzle/guzzle/tree/7.10.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/guzzle", + "type": "tidelift" + } + ], + "time": "2025-08-23T22:36:01+00:00" + }, + { + "name": "guzzlehttp/promises", + "version": "2.3.0", + "source": { + "type": "git", + "url": "https://github.com/guzzle/promises.git", + "reference": "481557b130ef3790cf82b713667b43030dc9c957" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/promises/zipball/481557b130ef3790cf82b713667b43030dc9c957", + "reference": "481557b130ef3790cf82b713667b43030dc9c957", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.2", + "phpunit/phpunit": "^8.5.44 || ^9.6.25" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + } + }, + "autoload": { + "psr-4": { + "GuzzleHttp\\Promise\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + } + ], + "description": "Guzzle promises library", + "keywords": [ + "promise" + ], + "support": { + "issues": "https://github.com/guzzle/promises/issues", + "source": "https://github.com/guzzle/promises/tree/2.3.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/promises", + "type": "tidelift" + } + ], + "time": "2025-08-22T14:34:08+00:00" + }, + { + "name": "guzzlehttp/psr7", + "version": "2.8.0", + "source": { + "type": "git", + "url": "https://github.com/guzzle/psr7.git", + "reference": "21dc724a0583619cd1652f673303492272778051" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/psr7/zipball/21dc724a0583619cd1652f673303492272778051", + "reference": "21dc724a0583619cd1652f673303492272778051", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0", + "psr/http-factory": "^1.0", + "psr/http-message": "^1.1 || ^2.0", + "ralouphie/getallheaders": "^3.0" + }, + "provide": { + "psr/http-factory-implementation": "1.0", + "psr/http-message-implementation": "1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.2", + "http-interop/http-factory-tests": "0.9.0", + "phpunit/phpunit": "^8.5.44 || ^9.6.25" + }, + "suggest": { + "laminas/laminas-httphandlerrunner": "Emit PSR-7 responses" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + } + }, + "autoload": { + "psr-4": { + "GuzzleHttp\\Psr7\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "George Mponos", + "email": "gmponos@gmail.com", + "homepage": "https://github.com/gmponos" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://github.com/sagikazarmark" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://sagikazarmark.hu" + } + ], + "description": "PSR-7 message implementation that also provides common utility methods", + "keywords": [ + "http", + "message", + "psr-7", + "request", + "response", + "stream", + "uri", + "url" + ], + "support": { + "issues": "https://github.com/guzzle/psr7/issues", + "source": "https://github.com/guzzle/psr7/tree/2.8.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/psr7", + "type": "tidelift" + } + ], + "time": "2025-08-23T21:21:41+00:00" + }, + { + "name": "masterminds/html5", + "version": "2.10.0", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "fcf91eb64359852f00d921887b219479b4f21251" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/fcf91eb64359852f00d921887b219479b4f21251", + "reference": "fcf91eb64359852f00d921887b219479b4f21251", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.10.0" + }, + "time": "2025-07-25T09:04:22+00:00" + }, + { + "name": "psr/http-client", + "version": "1.0.3", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-client.git", + "reference": "bb5906edc1c324c9a05aa0873d40117941e5fa90" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-client/zipball/bb5906edc1c324c9a05aa0873d40117941e5fa90", + "reference": "bb5906edc1c324c9a05aa0873d40117941e5fa90", + "shasum": "" + }, + "require": { + "php": "^7.0 || ^8.0", + "psr/http-message": "^1.0 || ^2.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Client\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "https://www.php-fig.org/" + } + ], + "description": "Common interface for HTTP clients", + "homepage": "https://github.com/php-fig/http-client", + "keywords": [ + "http", + "http-client", + "psr", + "psr-18" + ], + "support": { + "source": "https://github.com/php-fig/http-client" + }, + "time": "2023-09-23T14:17:50+00:00" + }, + { + "name": "psr/http-factory", + "version": "1.1.0", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-factory.git", + "reference": "2b4765fddfe3b508ac62f829e852b1501d3f6e8a" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-factory/zipball/2b4765fddfe3b508ac62f829e852b1501d3f6e8a", + "reference": "2b4765fddfe3b508ac62f829e852b1501d3f6e8a", + "shasum": "" + }, + "require": { + "php": ">=7.1", + "psr/http-message": "^1.0 || ^2.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "https://www.php-fig.org/" + } + ], + "description": "PSR-17: Common interfaces for PSR-7 HTTP message factories", + "keywords": [ + "factory", + "http", + "message", + "psr", + "psr-17", + "psr-7", + "request", + "response" + ], + "support": { + "source": "https://github.com/php-fig/http-factory" + }, + "time": "2024-04-15T12:06:14+00:00" + }, + { + "name": "psr/http-message", + "version": "2.0", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-message.git", + "reference": "402d35bcb92c70c026d1a6a9883f06b2ead23d71" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-message/zipball/402d35bcb92c70c026d1a6a9883f06b2ead23d71", + "reference": "402d35bcb92c70c026d1a6a9883f06b2ead23d71", + "shasum": "" + }, + "require": { + "php": "^7.2 || ^8.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "https://www.php-fig.org/" + } + ], + "description": "Common interface for HTTP messages", + "homepage": "https://github.com/php-fig/http-message", + "keywords": [ + "http", + "http-message", + "psr", + "psr-7", + "request", + "response" + ], + "support": { + "source": "https://github.com/php-fig/http-message/tree/2.0" + }, + "time": "2023-04-04T09:54:51+00:00" + }, + { + "name": "ralouphie/getallheaders", + "version": "3.0.3", + "source": { + "type": "git", + "url": "https://github.com/ralouphie/getallheaders.git", + "reference": "120b605dfeb996808c31b6477290a714d356e822" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/ralouphie/getallheaders/zipball/120b605dfeb996808c31b6477290a714d356e822", + "reference": "120b605dfeb996808c31b6477290a714d356e822", + "shasum": "" + }, + "require": { + "php": ">=5.6" + }, + "require-dev": { + "php-coveralls/php-coveralls": "^2.1", + "phpunit/phpunit": "^5 || ^6.5" + }, + "type": "library", + "autoload": { + "files": [ + "src/getallheaders.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Ralph Khattar", + "email": "ralph.khattar@gmail.com" + } + ], + "description": "A polyfill for getallheaders.", + "support": { + "issues": "https://github.com/ralouphie/getallheaders/issues", + "source": "https://github.com/ralouphie/getallheaders/tree/develop" + }, + "time": "2019-03-08T08:55:37+00:00" + }, + { + "name": "symfony/css-selector", + "version": "v7.3.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/css-selector.git", + "reference": "601a5ce9aaad7bf10797e3663faefce9e26c24e2" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/css-selector/zipball/601a5ce9aaad7bf10797e3663faefce9e26c24e2", + "reference": "601a5ce9aaad7bf10797e3663faefce9e26c24e2", + "shasum": "" + }, + "require": { + "php": ">=8.2" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\CssSelector\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Jean-François Simon", + "email": "jeanfrancois.simon@sensiolabs.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Converts CSS selectors to XPath expressions", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/css-selector/tree/v7.3.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-09-25T14:21:43+00:00" + }, + { + "name": "symfony/deprecation-contracts", + "version": "v3.6.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/deprecation-contracts.git", + "reference": "63afe740e99a13ba87ec199bb07bbdee937a5b62" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/63afe740e99a13ba87ec199bb07bbdee937a5b62", + "reference": "63afe740e99a13ba87ec199bb07bbdee937a5b62", + "shasum": "" + }, + "require": { + "php": ">=8.1" + }, + "type": "library", + "extra": { + "thanks": { + "url": "https://github.com/symfony/contracts", + "name": "symfony/contracts" + }, + "branch-alias": { + "dev-main": "3.6-dev" + } + }, + "autoload": { + "files": [ + "function.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "A generic function and convention to trigger deprecation notices", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/deprecation-contracts/tree/v3.6.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-09-25T14:21:43+00:00" + }, + { + "name": "symfony/dom-crawler", + "version": "v7.3.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "efa076ea0eeff504383ff0dcf827ea5ce15690ba" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/efa076ea0eeff504383ff0dcf827ea5ce15690ba", + "reference": "efa076ea0eeff504383ff0dcf827ea5ce15690ba", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.2", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^6.4|^7.0" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v7.3.3" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2025-08-06T20:13:54+00:00" + }, + { + "name": "symfony/polyfill-ctype", + "version": "v1.33.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-ctype.git", + "reference": "a3cc8b044a6ea513310cbd48ef7333b384945638" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-ctype/zipball/a3cc8b044a6ea513310cbd48ef7333b384945638", + "reference": "a3cc8b044a6ea513310cbd48ef7333b384945638", + "shasum": "" + }, + "require": { + "php": ">=7.2" + }, + "provide": { + "ext-ctype": "*" + }, + "suggest": { + "ext-ctype": "For best performance" + }, + "type": "library", + "extra": { + "thanks": { + "url": "https://github.com/symfony/polyfill", + "name": "symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Ctype\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Gert de Pagter", + "email": "BackEndTea@gmail.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for ctype functions", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "ctype", + "polyfill", + "portable" + ], + "support": { + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.33.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-09-09T11:45:10+00:00" + }, + { + "name": "symfony/polyfill-mbstring", + "version": "v1.33.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-mbstring.git", + "reference": "6d857f4d76bd4b343eac26d6b539585d2bc56493" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/6d857f4d76bd4b343eac26d6b539585d2bc56493", + "reference": "6d857f4d76bd4b343eac26d6b539585d2bc56493", + "shasum": "" + }, + "require": { + "ext-iconv": "*", + "php": ">=7.2" + }, + "provide": { + "ext-mbstring": "*" + }, + "suggest": { + "ext-mbstring": "For best performance" + }, + "type": "library", + "extra": { + "thanks": { + "url": "https://github.com/symfony/polyfill", + "name": "symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Mbstring\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for the Mbstring extension", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "mbstring", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.33.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-12-23T08:48:59+00:00" + } + ], + "packages-dev": [], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": {}, + "prefer-stable": false, + "prefer-lowest": false, + "platform": { + "php": "^8.3" + }, + "platform-dev": {}, + "plugin-api-version": "2.6.0" +} diff --git a/src/crawler-worker.php b/src/crawler-worker.php new file mode 100644 index 0000000..3549145 --- /dev/null +++ b/src/crawler-worker.php @@ -0,0 +1,40 @@ +#!/usr/bin/env php +\n"); +} + +$jobId = (int)$argv[1]; + +try { + $db = Database::getInstance(); + + // Get job details + $stmt = $db->prepare("SELECT domain FROM crawl_jobs WHERE id = ?"); + $stmt->execute([$jobId]); + $job = $stmt->fetch(); + + if (!$job) { + die("Job not found\n"); + } + + echo "Starting crawl for: {$job['domain']}\n"; + + $crawler = new Crawler($jobId); + $crawler->start($job['domain']); + + echo "Crawl completed\n"; +} catch (Exception $e) { + echo "Error: " . $e->getMessage() . "\n"; + + // Mark job as failed + $db = Database::getInstance(); + $stmt = $db->prepare("UPDATE crawl_jobs SET status = 'failed' WHERE id = ?"); + $stmt->execute([$jobId]); +} diff --git a/src/index.php b/src/index.php new file mode 100644 index 0000000..0cc8fa3 --- /dev/null +++ b/src/index.php @@ -0,0 +1,479 @@ + + + + + + Web Crawler + + + +
+

🕷️ Web Crawler

+ +
+

Neue Domain crawlen

+
+ + +
+
+ +
+

Crawl Jobs

+ + + + + + + + + + + + + + + +
IDDomainStatusSeitenLinksGestartetAktionen
Lade...
+
+ + +
+ + + + diff --git a/start.sh b/start.sh new file mode 100644 index 0000000..1d1d8d7 --- /dev/null +++ b/start.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Start PHP-FPM +php-fpm -D + +# Start Nginx in foreground +nginx -g 'daemon off;'