Merge conflict resolved in .gitignore

This commit is contained in:
2025-10-03 19:57:58 +02:00
20 changed files with 4263 additions and 1 deletions

6
.env.example Normal file
View File

@@ -0,0 +1,6 @@
# Database Configuration
DB_HOST=mariadb
DB_NAME=app_database
DB_USER=app_user
DB_PASSWORD=app_password
DB_ROOT_PASSWORD=root_password

26
.gitignore vendored
View File

@@ -1 +1,25 @@
/.idea/
# IDE
.idea/
.vscode/
# Dependencies
vendor/
node_modules/
# Environment
.env
.env.local
# OS
.DS_Store
Thumbs.db
# Logs
*.log
# Temporary files
*.tmp
*.cache
# Docker
docker-compose.override.yml

45
Dockerfile Normal file
View File

@@ -0,0 +1,45 @@
# Single-container image: PHP-FPM 8.3 and nginx, both launched by /start.sh.
FROM php:8.3-fpm

# Install system packages, build the PHP extensions the app needs
# (gd with FreeType/JPEG, PDO MySQL, mysqli, zip) and drop the apt caches in the
# same layer so they never end up in the image.
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        nginx \
        libpng-dev \
        libjpeg-dev \
        libfreetype6-dev \
        libzip-dev \
        zip \
        unzip \
        git \
        curl \
    && docker-php-ext-configure gd --with-freetype --with-jpeg \
    && docker-php-ext-install -j$(nproc) gd pdo pdo_mysql mysqli zip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Composer from the official image. Pinned to the 2.x line instead of
# "latest" so a future major release cannot silently change the build.
COPY --from=composer:2 /usr/bin/composer /usr/bin/composer

# Remove the stock nginx site; the application's server block is provided
# separately (docker-compose.yml mounts it into /etc/nginx/conf.d).
RUN rm -rf /etc/nginx/sites-enabled/default

# Configure PHP-FPM to listen on port 9000 (nginx connects to it via FastCGI).
RUN sed -i 's/listen = 127.0.0.1:9000/listen = 9000/g' /usr/local/etc/php-fpm.d/www.conf

# Set working directory
WORKDIR /var/www/html

# Copy application files
COPY ./src /var/www/html

# Hand the application tree to the user PHP-FPM/nginx run as.
RUN chown -R www-data:www-data /var/www/html \
    && chmod -R 755 /var/www/html

# nginx serves HTTP on port 80
EXPOSE 80

# Start script (runs PHP-FPM in the background and nginx in the foreground)
COPY start.sh /start.sh
RUN chmod +x /start.sh
CMD ["/start.sh"]

58
README.md Normal file
View File

@@ -0,0 +1,58 @@
# PHP Docker Anwendung
Eine PHP-Anwendung mit MariaDB, die in Docker läuft.
## Anforderungen
- Docker
- Docker Compose
## Installation & Start
1. Container starten:
```bash
docker-compose up -d
```
2. Container stoppen:
```bash
docker-compose down
```
3. Container neu bauen:
```bash
docker-compose up -d --build
```
## Services
- **PHP Anwendung**: http://localhost:8080
- **phpMyAdmin**: http://localhost:8081
- **MariaDB**: Port 3307 auf dem Host (innerhalb des Docker-Netzwerks: `mariadb:3306`)
## Datenbank Zugangsdaten
- **Host**: mariadb
- **Datenbank**: app_database
- **Benutzer**: app_user
- **Passwort**: app_password
- **Root Passwort**: root_password
## Struktur
```
.
├── docker-compose.yml # Docker Compose Konfiguration
├── Dockerfile # PHP Container Image
├── start.sh # Container Start-Script
├── init.sql # Datenbank Initialisierung
├── config/
│ └── nginx/
│ └── default.conf # Nginx Konfiguration
└── src/
└── index.php # Hauptanwendung
```
## Entwicklung
Die Anwendungsdateien befinden sich im `src/` Verzeichnis und werden als Volume in den Container gemountet, sodass Änderungen sofort sichtbar sind.

27
composer.json Normal file
View File

@@ -0,0 +1,27 @@
{
"name": "web-crawler/app",
"description": "Web Crawler Application with Parallel Processing",
"type": "project",
"require": {
"php": "^8.3",
"guzzlehttp/guzzle": "^7.8",
"symfony/dom-crawler": "^7.0",
"symfony/css-selector": "^7.0"
},
"require-dev": {
"phpunit/phpunit": "^11.0"
},
"autoload": {
"psr-4": {
"App\\": "classes/"
}
},
"autoload-dev": {
"psr-4": {
"Tests\\": "tests/"
}
},
"scripts": {
"test": "phpunit"
}
}

31
config/nginx/default.conf Normal file
View File

@@ -0,0 +1,31 @@
# Virtual host for the PHP application: static files are served directly from
# /var/www/html, everything else is routed to PHP-FPM over FastCGI.
server {
    listen 80;
    server_name localhost;
    root /var/www/html;
    index index.php index.html;
    error_log /var/log/nginx/error.log;
    access_log /var/log/nginx/access.log;

    # Front-controller fallback: serve the file or directory if it exists,
    # otherwise hand the request (with its query string) to index.php.
    location / {
        try_files $uri $uri/ /index.php?$query_string;
    }

    # Execute PHP scripts through the FastCGI server on 127.0.0.1:9000.
    # NOTE(review): docker-compose.yml mounts ./src as the document root, so
    # crawler-worker.php, composer.json and vendor/ are reachable over HTTP —
    # consider denying them here.
    location ~ \.php$ {
        # Answer 404 for non-existent scripts instead of forwarding them to PHP.
        try_files $uri =404;
        fastcgi_split_path_info ^(.+\.php)(/.+)$;
        fastcgi_pass 127.0.0.1:9000;
        fastcgi_index index.php;
        include fastcgi_params;
        fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name;
        fastcgi_param PATH_INFO $fastcgi_path_info;
    }

    # Block paths containing "/.ht" (e.g. .htaccess, .htpasswd).
    location ~ /\.ht {
        deny all;
    }

    # Block paths containing "/.git".
    location ~ /\.git {
        deny all;
    }
}

57
docker-compose.yml Normal file
View File

@@ -0,0 +1,57 @@
# Local development stack: PHP/nginx application, MariaDB and phpMyAdmin on a
# shared bridge network.
version: '3.8'
services:
  # Application container built from the Dockerfile in this directory.
  php:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: php_app
    ports:
      # Application is reachable on host port 8080.
      - "8080:80"
    volumes:
      # Bind mount replaces the copy of ./src baked into the image, so edits on
      # the host are visible immediately.
      - ./src:/var/www/html
      - ./config/nginx/default.conf:/etc/nginx/conf.d/default.conf
    depends_on:
      - mariadb
    networks:
      - app-network
  # Database server; data lives in the named volume mariadb_data.
  mariadb:
    image: mariadb:11.5
    container_name: mariadb_db
    restart: unless-stopped
    environment:
      # NOTE(review): credentials are hard-coded here and repeated in
      # .env.example and src/classes/Database.php — keep them in sync.
      MYSQL_ROOT_PASSWORD: root_password
      MYSQL_DATABASE: app_database
      MYSQL_USER: app_user
      MYSQL_PASSWORD: app_password
    ports:
      # Published on host port 3307; containers on app-network use mariadb:3306.
      - "3307:3306"
    volumes:
      - mariadb_data:/var/lib/mysql
      # Schema script placed in the image's init directory.
      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
    networks:
      - app-network
  # Web UI for the database, reachable on host port 8081.
  phpmyadmin:
    image: phpmyadmin:latest
    container_name: phpmyadmin
    restart: unless-stopped
    environment:
      PMA_HOST: mariadb
      PMA_PORT: 3306
      MYSQL_ROOT_PASSWORD: root_password
    ports:
      - "8081:80"
    depends_on:
      - mariadb
    networks:
      - app-network
networks:
  app-network:
    driver: bridge
volumes:
  mariadb_data:
    driver: local

66
init.sql Normal file
View File

@@ -0,0 +1,66 @@
-- Database initialization script for Web Crawler
-- Creates four tables: crawl_jobs (one row per crawl), pages and links (crawl
-- results) and crawl_queue (URLs still to be fetched). All child tables
-- reference crawl_jobs with ON DELETE CASCADE, so deleting a job removes its
-- pages, links and queue entries.

-- Crawl Jobs Table
-- One row per requested crawl; total_pages / total_links are counters stored on
-- the job row.
CREATE TABLE IF NOT EXISTS crawl_jobs (
    id INT AUTO_INCREMENT PRIMARY KEY,
    domain VARCHAR(255) NOT NULL,
    status ENUM('pending', 'running', 'completed', 'failed') DEFAULT 'pending',
    total_pages INT DEFAULT 0,
    total_links INT DEFAULT 0,
    started_at TIMESTAMP NULL,
    completed_at TIMESTAMP NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    INDEX idx_domain (domain),
    INDEX idx_status (status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Pages Table
-- One row per fetched URL of a job.
-- NOTE: unique_job_url only covers the first 255 characters of url, so two
-- longer URLs of the same job that share that prefix count as duplicates.
CREATE TABLE IF NOT EXISTS pages (
    id INT AUTO_INCREMENT PRIMARY KEY,
    crawl_job_id INT NOT NULL,
    url VARCHAR(2048) NOT NULL,
    title VARCHAR(500),
    status_code INT,
    content_type VARCHAR(100),
    crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
    INDEX idx_crawl_job (crawl_job_id),
    INDEX idx_url (url(255)),
    UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Links Table
-- One row per link found on a page; source_url/target_url are stored alongside
-- the page reference, and crawl_job_id is duplicated here so links can be
-- selected per job without joining pages.
CREATE TABLE IF NOT EXISTS links (
    id INT AUTO_INCREMENT PRIMARY KEY,
    page_id INT NOT NULL,
    crawl_job_id INT NOT NULL,
    source_url VARCHAR(2048) NOT NULL,
    target_url VARCHAR(2048) NOT NULL,
    link_text VARCHAR(1000),
    is_nofollow BOOLEAN DEFAULT FALSE,
    is_internal BOOLEAN DEFAULT TRUE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE,
    FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
    INDEX idx_page (page_id),
    INDEX idx_crawl_job (crawl_job_id),
    INDEX idx_source_url (source_url(255)),
    INDEX idx_target_url (target_url(255)),
    INDEX idx_nofollow (is_nofollow)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Queue Table for parallel processing
-- Work list of URLs per job. unique_job_url (again limited to the first 255
-- characters of url) keeps the same URL from being queued twice for one job.
CREATE TABLE IF NOT EXISTS crawl_queue (
    id INT AUTO_INCREMENT PRIMARY KEY,
    crawl_job_id INT NOT NULL,
    url VARCHAR(2048) NOT NULL,
    depth INT DEFAULT 0,
    status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    retry_count INT DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    processed_at TIMESTAMP NULL,
    FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
    INDEX idx_status (status),
    INDEX idx_crawl_job (crawl_job_id),
    UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

21
phpunit.xml Normal file
View File

@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/11.0/phpunit.xsd"
bootstrap="vendor/autoload.php"
colors="true"
cacheDirectory=".phpunit.cache"
testdox="true">
<testsuites>
<testsuite name="Unit Tests">
<directory>tests/Unit</directory>
</testsuite>
<testsuite name="Integration Tests">
<directory>tests/Integration</directory>
</testsuite>
</testsuites>
<source>
<include>
<directory>src/classes</directory>
</include>
</source>
</phpunit>

128
src/api.php Normal file
View File

@@ -0,0 +1,128 @@
<?php

declare(strict_types=1);

/**
 * JSON API endpoint used by the crawler front end.
 *
 * The operation is chosen with the `action` query parameter:
 *   - start  (POST, field `domain`)  create a crawl job and launch a background worker
 *   - status (`job_id`)              the job row plus per-status counts of its queue
 *   - jobs                           the 50 most recently created jobs
 *   - pages  (`job_id`)              up to 1000 crawled pages of the job
 *   - links  (`job_id`)              up to 1000 extracted links of the job
 *   - delete (POST, field `job_id`)  delete the job
 *
 * Every response is a JSON object with a boolean `success` key. Invalid requests
 * are answered with HTTP 400 and an `error` message; unexpected failures with
 * HTTP 500 and a generic message (details go to the PHP error log).
 */

require_once __DIR__ . '/vendor/autoload.php';

use App\Database;
use App\Crawler;

header('Content-Type: application/json');

/**
 * Writes a JSON response body and sets the HTTP status code.
 *
 * Invalid UTF-8 (possible in crawled content) is substituted instead of making
 * the whole response fail to encode.
 *
 * @param array<string, mixed> $payload
 */
function sendJson(array $payload, int $status = 200): void
{
    http_response_code($status);
    echo json_encode($payload, JSON_INVALID_UTF8_SUBSTITUTE | JSON_THROW_ON_ERROR);
}

/**
 * Extracts `job_id` from a request array as an integer (0 when missing or not scalar).
 *
 * @param array<string, mixed> $input
 */
function readJobId(array $input): int
{
    $raw = $input['job_id'] ?? 0;

    return is_scalar($raw) ? (int) $raw : 0;
}

/**
 * Rejects state-changing actions that were not sent with POST.
 *
 * @throws InvalidArgumentException when the request method is not POST
 */
function requirePost(): void
{
    if (($_SERVER['REQUEST_METHOD'] ?? '') !== 'POST') {
        throw new InvalidArgumentException('This action requires a POST request');
    }
}

try {
    // Connect inside the try block so a connection failure is reported as JSON too.
    $db = Database::getInstance();
    $action = $_GET['action'] ?? '';

    switch ($action) {
        case 'start':
            requirePost();

            // A repeated form field arrives as an array; only a non-empty string is valid.
            $domain = $_POST['domain'] ?? '';
            if (!is_string($domain) || trim($domain) === '') {
                throw new InvalidArgumentException('Domain is required');
            }
            $domain = trim($domain);

            // Default to https when no scheme was given (scheme match is case-insensitive).
            if (!preg_match('#^https?://#i', $domain)) {
                $domain = 'https://' . $domain;
            }

            // The crawler derives its base domain from the host, so it must be present;
            // 255 is the width of crawl_jobs.domain.
            $host = parse_url($domain, PHP_URL_HOST);
            if (!is_string($host) || $host === '' || strlen($domain) > 255) {
                throw new InvalidArgumentException('Invalid domain');
            }

            // Create crawl job
            $stmt = $db->prepare("INSERT INTO crawl_jobs (domain, status) VALUES (?, 'pending')");
            $stmt->execute([$domain]);
            $jobId = $db->lastInsertId();

            // Start crawling in the background; every dynamic part of the command is
            // escaped or forced to an integer before it reaches the shell.
            $cmd = sprintf(
                'php %s %d > /dev/null 2>&1 &',
                escapeshellarg(__DIR__ . '/crawler-worker.php'),
                (int) $jobId
            );
            exec($cmd);

            sendJson([
                'success' => true,
                'job_id' => $jobId,
                'message' => 'Crawl job started',
            ]);
            break;

        case 'status':
            $jobId = readJobId($_GET);

            $stmt = $db->prepare("SELECT * FROM crawl_jobs WHERE id = ?");
            $stmt->execute([$jobId]);
            $job = $stmt->fetch();
            if (!$job) {
                throw new InvalidArgumentException('Job not found');
            }

            // Get queue statistics
            $stmt = $db->prepare("
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END) as pending,
                    SUM(CASE WHEN status = 'processing' THEN 1 ELSE 0 END) as processing,
                    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
                    SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
                FROM crawl_queue
                WHERE crawl_job_id = ?
            ");
            $stmt->execute([$jobId]);
            $queueStats = $stmt->fetch();

            sendJson([
                'success' => true,
                'job' => $job,
                'queue' => $queueStats,
            ]);
            break;

        case 'jobs':
            $stmt = $db->query("SELECT * FROM crawl_jobs ORDER BY created_at DESC LIMIT 50");

            sendJson([
                'success' => true,
                'jobs' => $stmt->fetchAll(),
            ]);
            break;

        case 'pages':
            $stmt = $db->prepare("SELECT * FROM pages WHERE crawl_job_id = ? ORDER BY id DESC LIMIT 1000");
            $stmt->execute([readJobId($_GET)]);

            sendJson([
                'success' => true,
                'pages' => $stmt->fetchAll(),
            ]);
            break;

        case 'links':
            $stmt = $db->prepare("SELECT * FROM links WHERE crawl_job_id = ? ORDER BY id DESC LIMIT 1000");
            $stmt->execute([readJobId($_GET)]);

            sendJson([
                'success' => true,
                'links' => $stmt->fetchAll(),
            ]);
            break;

        case 'delete':
            requirePost();

            $stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");
            $stmt->execute([readJobId($_POST)]);

            sendJson([
                'success' => true,
                'message' => 'Job deleted',
            ]);
            break;

        default:
            throw new InvalidArgumentException('Invalid action');
    }
} catch (InvalidArgumentException $e) {
    // Caller mistakes: the message is safe to show.
    sendJson(['success' => false, 'error' => $e->getMessage()], 400);
} catch (Throwable $e) {
    // Anything else is a server-side problem; keep internals out of the response.
    error_log('api.php: ' . $e->getMessage());
    sendJson(['success' => false, 'error' => 'Internal server error'], 500);
}

286
src/classes/Crawler.php Normal file
View File

@@ -0,0 +1,286 @@
<?php
namespace App;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
/**
 * Crawls a single site for one crawl job.
 *
 * URLs are taken from the crawl_queue table in batches and fetched concurrently
 * with a Guzzle pool. Every fetched page is stored in `pages`, every http(s)
 * link found on an HTML page in `links`, and internal, followable links are
 * appended to the queue until it runs empty.
 */
class Crawler {
    /** Links found on pages at this depth or deeper are stored but no longer queued. */
    private const MAX_DEPTH = 50;

    /** Column widths of the schema; longer values are cut or skipped before they are inserted. */
    private const MAX_URL_LENGTH = 2048;
    private const MAX_TITLE_LENGTH = 500;
    private const MAX_LINK_TEXT_LENGTH = 1000;
    private const MAX_CONTENT_TYPE_LENGTH = 100;

    private \PDO $db;
    private Client $client;
    private int $concurrency = 10; // Parallel requests
    private array $visited = [];
    private int $crawlJobId;
    private string $baseDomain;

    /**
     * @param int $crawlJobId id of the crawl_jobs row this crawler works for
     */
    public function __construct(int $crawlJobId) {
        $this->db = Database::getInstance();
        $this->crawlJobId = $crawlJobId;
        $this->client = new Client([
            'timeout' => 30,
            // NOTE(review): TLS certificate verification is disabled; confirm this is intended.
            'verify' => false,
            // Record the redirect chain so relative links can be resolved against
            // the URL that actually delivered the page.
            'allow_redirects' => ['track_redirects' => true],
            'headers' => [
                'User-Agent' => 'WebCrawler/1.0'
            ]
        ]);
    }

    /**
     * Runs the complete crawl: marks the job running, seeds the queue with the
     * start URL, drains the queue and finally marks the job completed.
     *
     * @param string $startUrl absolute URL; its host defines what counts as internal
     *
     * @throws \InvalidArgumentException when the start URL has no host
     */
    public function start(string $startUrl): void {
        $host = parse_url($startUrl, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            throw new \InvalidArgumentException("Invalid start URL: $startUrl");
        }
        $this->baseDomain = strtolower($host);

        // Update job status
        $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?");
        $stmt->execute([$this->crawlJobId]);

        // Normalize and add start URL to queue
        $this->addToQueue($this->normalizeUrl($startUrl), 0);

        // Process queue
        $this->processQueue();

        // Update job status
        $this->updateJobStats();
        $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'completed', completed_at = NOW() WHERE id = ?");
        $stmt->execute([$this->crawlJobId]);
    }

    /**
     * Queues a URL unless it was already visited or does not fit the url column.
     *
     * Duplicates are ignored by the database (INSERT IGNORE on the unique key);
     * any other database error is allowed to propagate instead of being hidden.
     */
    private function addToQueue(string $url, int $depth): void {
        if (isset($this->visited[$url]) || strlen($url) > self::MAX_URL_LENGTH) {
            return;
        }

        $stmt = $this->db->prepare(
            "INSERT IGNORE INTO crawl_queue (crawl_job_id, url, depth) VALUES (?, ?, ?)"
        );
        $stmt->execute([$this->crawlJobId, $url, $depth]);
    }

    /**
     * Fetches pending queue entries in batches of $concurrency until none are
     * left, refreshing the job counters after every batch so progress is visible
     * while the crawl is still running.
     */
    private function processQueue(): void {
        $stmt = $this->db->prepare(
            "SELECT id, url, depth FROM crawl_queue
            WHERE crawl_job_id = ? AND status = 'pending'
            LIMIT ?"
        );

        while (true) {
            // Bind as integers explicitly; LIMIT must not receive a quoted string.
            $stmt->bindValue(1, $this->crawlJobId, \PDO::PARAM_INT);
            $stmt->bindValue(2, $this->concurrency, \PDO::PARAM_INT);
            $stmt->execute();
            $urls = $stmt->fetchAll();

            if (empty($urls)) {
                break;
            }

            $this->crawlBatch($urls);
            $this->updateJobStats();
        }
    }

    /**
     * Requests all URLs of one batch concurrently and dispatches each result.
     *
     * @param array $urls list of queue rows with the keys id, url and depth
     */
    private function crawlBatch(array $urls): void {
        $markProcessing = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?");

        $requests = function () use ($urls, $markProcessing) {
            foreach ($urls as $item) {
                // Mark as processing
                $markProcessing->execute([$item['id']]);
                yield function () use ($item) {
                    return $this->client->getAsync($item['url']);
                };
            }
        };

        $pool = new Pool($this->client, $requests(), [
            'concurrency' => $this->concurrency,
            'fulfilled' => function ($response, $index) use ($urls) {
                $item = $urls[$index];
                try {
                    $this->handleResponse($item, $response);
                } catch (\Throwable $e) {
                    // A page that cannot be processed fails on its own instead of
                    // aborting the whole crawl.
                    $this->handleError($item, $e);
                }
            },
            'rejected' => function ($reason, $index) use ($urls) {
                $this->handleError($urls[$index], $reason);
            },
        ]);

        $pool->promise()->wait();
    }

    /**
     * Stores a successfully fetched page, extracts its links when it is HTML and
     * marks the queue entry completed.
     *
     * @param array $queueItem queue row with the keys id, url and depth
     * @param mixed $response  response object delivered by the Guzzle pool
     *
     * @throws \RuntimeException when the id of the stored page cannot be determined
     */
    private function handleResponse(array $queueItem, $response): void {
        $url = $queueItem['url'];
        $depth = (int) $queueItem['depth'];
        $this->visited[$url] = true;

        $statusCode = $response->getStatusCode();
        $contentType = $response->getHeaderLine('Content-Type');
        $isHtml = str_contains(strtolower($contentType), 'text/html');

        // Relative links belong to the URL that finally answered, not the queued one.
        $redirects = $response->getHeader('X-Guzzle-Redirect-History');
        $baseUrl = $redirects !== [] ? (string) end($redirects) : $url;

        // Only HTML is parsed; images, PDFs etc. are recorded without a title.
        $domCrawler = null;
        $title = '';
        if ($isHtml) {
            $domCrawler = new DomCrawler((string) $response->getBody(), $baseUrl);
            $titleNodes = $domCrawler->filter('title');
            if ($titleNodes->count() > 0) {
                $title = mb_substr($titleNodes->text(), 0, self::MAX_TITLE_LENGTH);
            }
        }

        // Save page
        $stmt = $this->db->prepare(
            "INSERT INTO pages (crawl_job_id, url, title, status_code, content_type)
            VALUES (?, ?, ?, ?, ?)
            ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code)"
        );
        $stmt->execute([
            $this->crawlJobId,
            $url,
            $title,
            $statusCode,
            substr($contentType, 0, self::MAX_CONTENT_TYPE_LENGTH),
        ]);
        $pageId = (int) $this->db->lastInsertId();

        // If pageId is 0, fetch it manually
        if ($pageId === 0) {
            $stmt = $this->db->prepare("SELECT id FROM pages WHERE crawl_job_id = ? AND url = ?");
            $stmt->execute([$this->crawlJobId, $url]);
            $pageId = (int) $stmt->fetchColumn();
        }
        if ($pageId === 0) {
            throw new \RuntimeException("Could not determine page id for $url");
        }

        // Extract and save links
        if ($domCrawler !== null) {
            echo "Extracting links from: $url (pageId: $pageId)\n";
            $this->extractLinks($domCrawler, $url, $pageId, $depth, $baseUrl);
        } else {
            echo "Skipping link extraction - content type: $contentType\n";
        }

        // Mark as completed
        $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'completed', processed_at = NOW() WHERE id = ?");
        $stmt->execute([$queueItem['id']]);
    }

    /**
     * Stores every http(s) link of a page and queues the internal, followable ones.
     *
     * Links without href, pure fragment links (#...) and non-web schemes such as
     * javascript:, mailto: or tel: are counted but neither stored nor queued. A
     * failure on one link is reported and does not stop the remaining links.
     *
     * @param string      $sourceUrl URL stored as the origin of each link
     * @param string|null $baseUrl   URL that relative links are resolved against;
     *                               defaults to $sourceUrl
     */
    private function extractLinks(
        DomCrawler $crawler,
        string $sourceUrl,
        int $pageId,
        int $depth,
        ?string $baseUrl = null
    ): void {
        $baseUrl ??= $sourceUrl;
        $linkCount = 0;
        $insertLink = $this->db->prepare(
            "INSERT INTO links (page_id, crawl_job_id, source_url, target_url, link_text, is_nofollow, is_internal)
            VALUES (?, ?, ?, ?, ?, ?, ?)"
        );

        $crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $baseUrl, $pageId, $depth, $insertLink, &$linkCount) {
            try {
                $linkCount++;

                $href = trim((string) $node->attr('href'));
                if ($href === '' || $href[0] === '#') {
                    return;
                }

                // Convert relative URLs to absolute
                $targetUrl = $this->makeAbsoluteUrl($href, $baseUrl);

                // Anything that is not a web URL cannot be crawled and is not a page link.
                $scheme = strtolower((string) parse_url($targetUrl, PHP_URL_SCHEME));
                if ($scheme !== 'http' && $scheme !== 'https') {
                    return;
                }

                // Get link text
                $linkText = mb_substr(trim($node->text()), 0, self::MAX_LINK_TEXT_LENGTH);

                // Check nofollow (rel tokens are case-insensitive)
                $isNofollow = str_contains(strtolower($node->attr('rel') ?? ''), 'nofollow');

                // Check if internal (same domain, no subdomains)
                $targetDomain = strtolower(parse_url($targetUrl, PHP_URL_HOST) ?? '');
                $isInternal = ($targetDomain === $this->baseDomain);

                // Save link
                $insertLink->execute([
                    $pageId,
                    $this->crawlJobId,
                    $sourceUrl,
                    $targetUrl,
                    $linkText,
                    $isNofollow ? 1 : 0,
                    $isInternal ? 1 : 0
                ]);

                // Add to queue if internal and not nofollow
                if ($isInternal && !$isNofollow && $depth < self::MAX_DEPTH) {
                    // Normalize URL (remove fragment, trailing slash)
                    $this->addToQueue($this->normalizeUrl($targetUrl), $depth + 1);
                }
            } catch (\Exception $e) {
                echo "Error processing link: " . $e->getMessage() . "\n";
            }
        });

        echo "Processed $linkCount links from $sourceUrl\n";
    }

    /**
     * Resolves a link against the URL of the page it was found on.
     *
     * Delegates to Symfony's UriResolver, which covers absolute URLs,
     * scheme-relative (//host/path), root-relative, query-only and relative
     * paths including "." and ".." segments.
     */
    private function makeAbsoluteUrl(string $url, string $base): string {
        return \Symfony\Component\DomCrawler\UriResolver::resolve($url, $base);
    }

    /**
     * Marks a queue entry as failed and reports the cause when one is available.
     *
     * @param array $queueItem queue row with the keys id and url
     * @param mixed $reason    rejection reason from the pool or a caught Throwable
     */
    private function handleError(array $queueItem, $reason): void {
        if ($reason instanceof \Throwable) {
            echo "Failed to crawl {$queueItem['url']}: " . $reason->getMessage() . "\n";
        }

        $stmt = $this->db->prepare(
            "UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?"
        );
        $stmt->execute([$queueItem['id']]);
    }

    /**
     * Writes the current number of stored pages and links onto the job row.
     */
    private function updateJobStats(): void {
        $stmt = $this->db->prepare(
            "UPDATE crawl_jobs SET
            total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?),
            total_links = (SELECT COUNT(*) FROM links WHERE crawl_job_id = ?)
            WHERE id = ?"
        );
        $stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]);
    }

    /**
     * Produces the canonical form of a URL used as queue key: fragment removed,
     * host lower-cased and aligned with the base domain's use of "www.",
     * trailing slash removed except for the root path. User info is not kept.
     * URLs that parse_url() cannot parse are returned unchanged.
     */
    private function normalizeUrl(string $url): string {
        // Parse URL
        $parts = parse_url($url);
        if (!$parts) {
            return $url;
        }

        // Remove fragment
        unset($parts['fragment']);

        // Normalize domain (add www if base domain has it, or remove if base doesn't)
        if (isset($parts['host'])) {
            // Always convert to lowercase
            $parts['host'] = strtolower($parts['host']);

            // Match www pattern with base domain
            $baseHasWww = str_starts_with($this->baseDomain, 'www.');
            $urlHasWww = str_starts_with($parts['host'], 'www.');
            if ($baseHasWww && !$urlHasWww) {
                $parts['host'] = 'www.' . $parts['host'];
            } elseif (!$baseHasWww && $urlHasWww) {
                $parts['host'] = substr($parts['host'], 4);
            }
        }

        // Normalize path - remove trailing slash except for root
        if (isset($parts['path']) && $parts['path'] !== '/') {
            $parts['path'] = rtrim($parts['path'], '/');
        }

        // Rebuild URL
        $scheme = isset($parts['scheme']) ? $parts['scheme'] . '://' : '';
        $host = $parts['host'] ?? '';
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';
        $path = $parts['path'] ?? '/';
        $query = isset($parts['query']) ? '?' . $parts['query'] : '';

        return $scheme . $host . $port . $path . $query;
    }
}

32
src/classes/Database.php Normal file
View File

@@ -0,0 +1,32 @@
<?php
namespace App;
use PDO;
use PDOException;
/**
 * Process-wide access point to the application's PDO connection.
 *
 * Connection settings are read from the environment variables DB_HOST, DB_NAME,
 * DB_USER and DB_PASSWORD (the names used in .env.example). When a variable is
 * not set, the previous built-in development value is used.
 */
class Database {
    private static ?PDO $instance = null;

    /** Not instantiable; use getInstance(). */
    private function __construct() {}

    /**
     * Returns the shared PDO connection, opening it on first use.
     *
     * The connection throws exceptions on errors, fetches associative arrays by
     * default and uses native (non-emulated) prepared statements.
     *
     * @throws \Exception when the connection cannot be established; the original
     *                    PDOException is attached as the previous exception
     */
    public static function getInstance(): PDO {
        if (self::$instance === null) {
            $dsn = sprintf(
                'mysql:host=%s;dbname=%s;charset=utf8mb4',
                self::env('DB_HOST', 'mariadb'),
                self::env('DB_NAME', 'app_database')
            );

            try {
                self::$instance = new PDO(
                    $dsn,
                    self::env('DB_USER', 'app_user'),
                    self::env('DB_PASSWORD', 'app_password'),
                    [
                        PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
                        PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
                        PDO::ATTR_EMULATE_PREPARES => false,
                    ]
                );
            } catch (PDOException $e) {
                throw new \Exception("Database connection failed: " . $e->getMessage(), 0, $e);
            }
        }

        return self::$instance;
    }

    /**
     * Reads an environment variable, falling back to $default when it is unset or empty.
     */
    private static function env(string $name, string $default): string {
        $value = getenv($name);

        return ($value === false || $value === '') ? $default : $value;
    }
}

27
src/composer.json Normal file
View File

@@ -0,0 +1,27 @@
{
"name": "web-crawler/app",
"description": "Web Crawler Application with Parallel Processing",
"type": "project",
"require": {
"php": "^8.3",
"guzzlehttp/guzzle": "^7.8",
"symfony/dom-crawler": "^7.0",
"symfony/css-selector": "^7.0"
},
"require-dev": {
"phpunit/phpunit": "^11.0"
},
"autoload": {
"psr-4": {
"App\\": "classes/"
}
},
"autoload-dev": {
"psr-4": {
"Tests\\": "tests/"
}
},
"scripts": {
"test": "phpunit"
}
}

2757
src/composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

40
src/crawler-worker.php Normal file
View File

@@ -0,0 +1,40 @@
#!/usr/bin/env php
<?php

declare(strict_types=1);

/**
 * Command line worker that runs one crawl job.
 *
 * Usage: php crawler-worker.php <job_id>
 *
 * Loads the job's domain from crawl_jobs and hands it to App\Crawler. On any
 * failure the job is marked as failed. Exit status is 0 on success and 1 on
 * every error, so callers and supervisors can tell the difference.
 */

require_once __DIR__ . '/vendor/autoload.php';

use App\Database;
use App\Crawler;

// This script is only meant to be started from the command line.
if (PHP_SAPI !== 'cli') {
    exit(1);
}

// The job id must be a positive integer; "abc" or "0" is a usage error.
$jobId = $argc >= 2
    ? filter_var($argv[1], FILTER_VALIDATE_INT, ['options' => ['min_range' => 1]])
    : false;
if ($jobId === false) {
    fwrite(STDERR, "Usage: php crawler-worker.php <job_id>\n");
    exit(1);
}

$db = null;
try {
    $db = Database::getInstance();

    // Get job details
    $stmt = $db->prepare("SELECT domain FROM crawl_jobs WHERE id = ?");
    $stmt->execute([$jobId]);
    $job = $stmt->fetch();
    if (!$job) {
        fwrite(STDERR, "Job not found\n");
        exit(1);
    }

    echo "Starting crawl for: {$job['domain']}\n";
    $crawler = new Crawler($jobId);
    $crawler->start($job['domain']);
    echo "Crawl completed\n";
} catch (Throwable $e) {
    // Throwable also covers PHP errors (e.g. TypeError), which would otherwise
    // leave the job in the "running" state forever.
    echo "Error: " . $e->getMessage() . "\n";

    // Mark job as failed — only when a connection exists; if connecting was the
    // failure, trying again here would just throw a second time.
    if ($db !== null) {
        try {
            $stmt = $db->prepare("UPDATE crawl_jobs SET status = 'failed' WHERE id = ?");
            $stmt->execute([$jobId]);
        } catch (Throwable $inner) {
            echo "Error: " . $inner->getMessage() . "\n";
        }
    }

    exit(1);
}

479
src/index.php Normal file
View File

@@ -0,0 +1,479 @@
<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Web Crawler</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: #ffe4e9;
padding: 20px;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
h1 {
color: #2c3e50;
margin-bottom: 30px;
}
.card {
background: white;
border-radius: 8px;
padding: 25px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.input-group {
display: flex;
gap: 10px;
margin-bottom: 20px;
}
input[type="text"] {
flex: 1;
padding: 12px 16px;
border: 2px solid #e0e0e0;
border-radius: 6px;
font-size: 16px;
}
input[type="text"]:focus {
outline: none;
border-color: #3498db;
}
button {
padding: 12px 24px;
background: #3498db;
color: white;
border: none;
border-radius: 6px;
font-size: 16px;
cursor: pointer;
transition: background 0.3s;
}
button:hover {
background: #2980b9;
}
button:disabled {
background: #bdc3c7;
cursor: not-allowed;
}
.status {
display: inline-block;
padding: 4px 12px;
border-radius: 4px;
font-size: 12px;
font-weight: 600;
text-transform: uppercase;
}
.status.pending { background: #f39c12; color: white; }
.status.running { background: #3498db; color: white; }
.status.completed { background: #27ae60; color: white; }
.status.failed { background: #e74c3c; color: white; }
table {
width: 100%;
border-collapse: collapse;
}
th, td {
padding: 12px;
text-align: left;
border-bottom: 1px solid #ecf0f1;
}
th {
background: #f8f9fa;
font-weight: 600;
color: #2c3e50;
}
tr:hover {
background: #f8f9fa;
}
.tabs {
display: flex;
gap: 10px;
margin-bottom: 20px;
border-bottom: 2px solid #ecf0f1;
}
.tab {
padding: 12px 20px;
background: none;
border: none;
border-bottom: 3px solid transparent;
cursor: pointer;
color: #7f8c8d;
font-weight: 500;
}
.tab.active {
color: #3498db;
border-bottom-color: #3498db;
}
.tab-content {
display: none;
}
.tab-content.active {
display: block;
}
.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
margin-top: 15px;
}
.stat-box {
background: #ecf0f1;
padding: 15px;
border-radius: 6px;
}
.stat-label {
font-size: 12px;
color: #7f8c8d;
text-transform: uppercase;
margin-bottom: 5px;
}
.stat-value {
font-size: 24px;
font-weight: 700;
color: #2c3e50;
}
.stat-sublabel {
font-size: 11px;
color: #95a5a6;
margin-top: 3px;
}
.nofollow {
color: #e74c3c;
font-weight: 600;
}
.external {
color: #3498db;
}
.loading {
text-align: center;
padding: 40px;
color: #7f8c8d;
}
.action-btn {
padding: 6px 12px;
font-size: 14px;
margin-right: 5px;
}
.url-cell {
max-width: 400px;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
</style>
</head>
<body>
<div class="container">
<h1>🕷️ Web Crawler</h1>
<div class="card">
<h2>Neue Domain crawlen</h2>
<div class="input-group">
<input type="text" id="domainInput" placeholder="example.com oder https://example.com" />
<button onclick="startCrawl()">Crawl starten</button>
</div>
</div>
<div class="card">
<h2>Crawl Jobs</h2>
<table id="jobsTable">
<thead>
<tr>
<th>ID</th>
<th>Domain</th>
<th>Status</th>
<th>Seiten</th>
<th>Links</th>
<th>Gestartet</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody id="jobsBody">
<tr><td colspan="7" class="loading">Lade...</td></tr>
</tbody>
</table>
</div>
<div id="jobDetails" style="display: none;">
<div class="card">
<h2>Job Details: <span id="jobDomain"></span></h2>
<div class="stats" id="jobStats"></div>
<div class="tabs">
<button class="tab active" onclick="switchTab('pages')">Seiten</button>
<button class="tab" onclick="switchTab('links')">Links</button>
</div>
<div class="tab-content active" id="pages-tab">
<table>
<thead>
<tr>
<th>URL</th>
<th>Titel</th>
<th>Status</th>
<th>Gecrawlt</th>
</tr>
</thead>
<tbody id="pagesBody">
<tr><td colspan="4" class="loading">Keine Seiten gefunden</td></tr>
</tbody>
</table>
</div>
<div class="tab-content" id="links-tab">
<table>
<thead>
<tr>
<th>Von</th>
<th>Nach</th>
<th>Link-Text</th>
<th>Nofollow</th>
<th>Typ</th>
</tr>
</thead>
<tbody id="linksBody">
<tr><td colspan="5" class="loading">Keine Links gefunden</td></tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
<script>
// Id of the job shown in the details card (null when the card is hidden).
let currentJobId = null;
// Handle of the polling timer that refreshes the details card.
let refreshInterval = null;
// True while a details refresh is running, so slow responses do not pile up.
let detailsRequestInFlight = false;

/**
 * Escapes a value for use inside HTML text or a quoted attribute.
 * Job domains, page titles, URLs and link texts come from crawled sites and
 * must never reach innerHTML unescaped.
 */
function escapeHtml(value) {
    return String(value ?? '')
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

/** Stops the periodic refresh of the details card, if it is running. */
function stopDetailsRefresh() {
    if (refreshInterval) {
        clearInterval(refreshInterval);
        refreshInterval = null;
    }
}

/** Reads the domain field and asks the API to start a new crawl job. */
async function startCrawl() {
    const domain = document.getElementById('domainInput').value.trim();
    if (!domain) {
        alert('Bitte Domain eingeben');
        return;
    }

    const formData = new FormData();
    formData.append('domain', domain);

    try {
        const response = await fetch('/api.php?action=start', {
            method: 'POST',
            body: formData
        });
        const data = await response.json();

        if (data.success) {
            document.getElementById('domainInput').value = '';
            loadJobs();
            alert('Crawl gestartet! Job ID: ' + data.job_id);
        } else {
            alert('Fehler: ' + data.error);
        }
    } catch (e) {
        alert('Fehler beim Starten: ' + e.message);
    }
}

/** Fetches the job list and re-renders the jobs table. */
async function loadJobs() {
    try {
        const response = await fetch('/api.php?action=jobs');
        const data = await response.json();
        if (!data.success) return;

        const rows = data.jobs.map(job => {
            // Force the id to a number before it is placed inside an onclick handler.
            const id = Number(job.id);
            const status = escapeHtml(job.status);
            return `
                <tr>
                    <td>${id}</td>
                    <td>${escapeHtml(job.domain)}</td>
                    <td><span class="status ${status}">${status}</span></td>
                    <td>${escapeHtml(job.total_pages)}</td>
                    <td>${escapeHtml(job.total_links)}</td>
                    <td>${escapeHtml(job.started_at || '-')}</td>
                    <td>
                        <button class="action-btn" onclick="viewJob(${id})">Ansehen</button>
                        <button class="action-btn" onclick="deleteJob(${id})">Löschen</button>
                    </td>
                </tr>
            `;
        });

        document.getElementById('jobsBody').innerHTML = rows.length > 0
            ? rows.join('')
            : '<tr><td colspan="7" class="loading">Keine Jobs vorhanden</td></tr>';
    } catch (e) {
        console.error('Fehler beim Laden der Jobs:', e);
    }
}

/** Opens the details card for a job and refreshes it every second. */
async function viewJob(jobId) {
    currentJobId = jobId;
    document.getElementById('jobDetails').style.display = 'block';

    // Start auto-refresh every 1 second
    stopDetailsRefresh();
    loadJobDetails();
    refreshInterval = setInterval(loadJobDetails, 1000);
}

/**
 * Loads status, pages and links of the current job and renders them.
 * Results that arrive after the user switched to another job are discarded,
 * and empty result sets replace the rows of a previously viewed job.
 */
async function loadJobDetails() {
    if (!currentJobId || detailsRequestInFlight) return;

    const jobId = currentJobId;
    const fetchJson = async (action) => {
        const response = await fetch(`/api.php?action=${action}&job_id=${encodeURIComponent(jobId)}`);
        return response.json();
    };

    detailsRequestInFlight = true;
    try {
        // Load job status
        const statusData = await fetchJson('status');
        if (jobId !== currentJobId) return;

        if (statusData.success) {
            const job = statusData.job;
            const queue = statusData.queue;
            const status = escapeHtml(job.status);

            document.getElementById('jobDomain').textContent = job.domain;

            const queueInfo = queue ? `
                <div class="stat-box">
                    <div class="stat-label">Warteschlange</div>
                    <div class="stat-value">${escapeHtml(queue.pending || 0)}</div>
                    <div class="stat-sublabel">noch zu crawlen</div>
                </div>
                <div class="stat-box">
                    <div class="stat-label">Verarbeitet</div>
                    <div class="stat-value">${escapeHtml(queue.completed || 0)}</div>
                    <div class="stat-sublabel">abgeschlossen</div>
                </div>
            ` : '';

            document.getElementById('jobStats').innerHTML = `
                <div class="stat-box">
                    <div class="stat-label">Status</div>
                    <div class="stat-value"><span class="status ${status}">${status}</span></div>
                </div>
                <div class="stat-box">
                    <div class="stat-label">Seiten</div>
                    <div class="stat-value">${escapeHtml(job.total_pages)}</div>
                </div>
                <div class="stat-box">
                    <div class="stat-label">Links</div>
                    <div class="stat-value">${escapeHtml(job.total_links)}</div>
                </div>
                ${queueInfo}
            `;

            // Stop refresh if completed or failed
            if (job.status === 'completed' || job.status === 'failed') {
                stopDetailsRefresh();
            }
        }

        // Load pages
        const pagesData = await fetchJson('pages');
        if (jobId !== currentJobId) return;

        if (pagesData.success) {
            const pageRows = pagesData.pages.map(page => `
                <tr>
                    <td class="url-cell" title="${escapeHtml(page.url)}">${escapeHtml(page.url)}</td>
                    <td>${escapeHtml(page.title || '-')}</td>
                    <td>${escapeHtml(page.status_code)}</td>
                    <td>${escapeHtml(page.crawled_at)}</td>
                </tr>
            `);
            document.getElementById('pagesBody').innerHTML = pageRows.length > 0
                ? pageRows.join('')
                : '<tr><td colspan="4" class="loading">Keine Seiten gefunden</td></tr>';
        }

        // Load links
        const linksData = await fetchJson('links');
        if (jobId !== currentJobId) return;

        if (linksData.success) {
            const linkRows = linksData.links.map(link => `
                <tr>
                    <td class="url-cell" title="${escapeHtml(link.source_url)}">${escapeHtml(link.source_url)}</td>
                    <td class="url-cell" title="${escapeHtml(link.target_url)}">${escapeHtml(link.target_url)}</td>
                    <td>${escapeHtml(link.link_text || '-')}</td>
                    <td>${Number(link.is_nofollow) ? '<span class="nofollow">Ja</span>' : 'Nein'}</td>
                    <td>${Number(link.is_internal) ? 'Intern' : '<span class="external">Extern</span>'}</td>
                </tr>
            `);
            document.getElementById('linksBody').innerHTML = linkRows.length > 0
                ? linkRows.join('')
                : '<tr><td colspan="5" class="loading">Keine Links gefunden</td></tr>';
        }

        // Update jobs table
        loadJobs();
    } catch (e) {
        console.error('Fehler beim Laden der Details:', e);
    } finally {
        detailsRequestInFlight = false;
    }
}

/** Deletes a job after confirmation and hides its details card if it was open. */
async function deleteJob(jobId) {
    if (!confirm('Job wirklich löschen?')) return;

    const formData = new FormData();
    formData.append('job_id', jobId);

    try {
        const response = await fetch('/api.php?action=delete', {
            method: 'POST',
            body: formData
        });
        const data = await response.json();

        if (!data.success) {
            alert('Fehler beim Löschen: ' + data.error);
            return;
        }

        loadJobs();
        if (currentJobId === jobId) {
            stopDetailsRefresh();
            document.getElementById('jobDetails').style.display = 'none';
            currentJobId = null;
        }
    } catch (e) {
        alert('Fehler beim Löschen: ' + e.message);
    }
}

/**
 * Activates the tab with the given name ('pages' or 'links').
 * The matching button is found through its onclick attribute rather than the
 * non-standard implicit global `event`.
 */
function switchTab(tab) {
    document.querySelectorAll('.tab').forEach(button => {
        const handler = button.getAttribute('onclick') || '';
        button.classList.toggle('active', handler.includes(`'${tab}'`));
    });
    document.querySelectorAll('.tab-content').forEach(panel => {
        panel.classList.toggle('active', panel.id === tab + '-tab');
    });
}

// Initial load
loadJobs();
setInterval(loadJobs, 5000);
</script>
</body>
</html>

7
start.sh Normal file
View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Container start script: runs PHP-FPM in the background and nginx in the foreground.

# Abort immediately if a command fails (for example PHP-FPM refusing to start)
# instead of bringing up nginx without a PHP backend.
set -euo pipefail

# Start PHP-FPM (-D = run as a daemon)
php-fpm -D

# Start Nginx in foreground. exec replaces this shell with nginx, so nginx is
# the container's main process and receives stop signals directly.
exec nginx -g 'daemon off;'

View File

@@ -0,0 +1,66 @@
<?php
namespace Tests\Integration;
use PHPUnit\Framework\TestCase;
use App\Crawler;
use App\Database;
/**
 * Integration tests that run the crawler against https://httpbin.org/html and
 * inspect what it wrote to the database. Each test gets its own crawl job,
 * which is deleted again afterwards.
 */
class CrawlerIntegrationTest extends TestCase
{
    private int $testJobId;
    private \PDO $db;

    /** Opens the connection and inserts a fresh pending job for the test. */
    protected function setUp(): void
    {
        $this->db = Database::getInstance();

        $insert = $this->db->prepare("INSERT INTO crawl_jobs (domain, status) VALUES (?, 'pending')");
        $insert->execute(['https://httpbin.org']);
        $this->testJobId = $this->db->lastInsertId();
    }

    /** Removes the job created in setUp(). */
    protected function tearDown(): void
    {
        $delete = $this->db->prepare("DELETE FROM crawl_jobs WHERE id = ?");
        $delete->execute([$this->testJobId]);
    }

    /** After a crawl attempt the job must be in one of the states running, completed or failed. */
    public function testCrawlerUpdatesJobStatusToRunning(): void
    {
        $this->crawlIgnoringFailures();

        $query = $this->db->prepare("SELECT status FROM crawl_jobs WHERE id = ?");
        $query->execute([$this->testJobId]);
        $row = $query->fetch();

        $this->assertContains($row['status'], ['running', 'completed', 'failed']);
    }

    /** A crawl attempt must leave at least one entry in the queue table. */
    public function testCrawlerCreatesQueueEntries(): void
    {
        $this->crawlIgnoringFailures();

        $query = $this->db->prepare("SELECT COUNT(*) as count FROM crawl_queue WHERE crawl_job_id = ?");
        $query->execute([$this->testJobId]);
        $row = $query->fetch();

        $this->assertGreaterThan(0, $row['count']);
    }

    /**
     * Starts a crawl for the test job; an exception raised by the crawl is
     * tolerated because the test environment may not allow it to finish.
     */
    private function crawlIgnoringFailures(): void
    {
        $crawler = new Crawler($this->testJobId);

        try {
            $crawler->start('https://httpbin.org/html');
        } catch (\Exception $ignored) {
            // Expected to fail in test environment
        }
    }
}

View File

@@ -0,0 +1,48 @@
<?php
namespace Tests\Unit;
use PHPUnit\Framework\TestCase;
use App\Crawler;
use App\Database;
/**
 * Basic tests for App\Crawler that only need a crawl job row in the database.
 * The row is created before and deleted after every test.
 */
class CrawlerTest extends TestCase
{
    private int $testJobId;

    /** Inserts a pending job for https://example.com and remembers its id. */
    protected function setUp(): void
    {
        $connection = Database::getInstance();

        $insert = $connection->prepare("INSERT INTO crawl_jobs (domain, status) VALUES (?, 'pending')");
        $insert->execute(['https://example.com']);
        $this->testJobId = $connection->lastInsertId();
    }

    /** Deletes the job inserted in setUp(). */
    protected function tearDown(): void
    {
        $delete = Database::getInstance()->prepare("DELETE FROM crawl_jobs WHERE id = ?");
        $delete->execute([$this->testJobId]);
    }

    /** Constructing a crawler for an existing job yields a Crawler instance. */
    public function testCrawlerCanBeInstantiated(): void
    {
        $this->assertInstanceOf(Crawler::class, new Crawler($this->testJobId));
    }

    /** A freshly inserted job is stored with the status "pending". */
    public function testCrawlerCreatesJobWithCorrectStatus(): void
    {
        $query = Database::getInstance()->prepare("SELECT status FROM crawl_jobs WHERE id = ?");
        $query->execute([$this->testJobId]);
        $row = $query->fetch();

        $this->assertEquals('pending', $row['status']);
    }
}

View File

@@ -0,0 +1,57 @@
<?php
namespace Tests\Unit;
use PHPUnit\Framework\TestCase;
use App\Database;
use PDO;
/**
 * Tests for App\Database: the accessor returns one shared, correctly configured
 * PDO connection that can run and prepare statements.
 */
class DatabaseTest extends TestCase
{
    /** getInstance() hands out a PDO object. */
    public function testGetInstanceReturnsPDO(): void
    {
        $this->assertInstanceOf(PDO::class, Database::getInstance());
    }

    /** Two calls return the very same connection object. */
    public function testGetInstanceReturnsSameInstance(): void
    {
        $first = Database::getInstance();
        $second = Database::getInstance();

        $this->assertSame($first, $second);
    }

    /** The connection reports exception error mode and associative fetch mode. */
    public function testDatabaseConnectionHasCorrectAttributes(): void
    {
        $connection = Database::getInstance();
        $expectedAttributes = [
            // Test error mode
            PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
            // Test fetch mode
            PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
        ];

        foreach ($expectedAttributes as $attribute => $expectedValue) {
            $this->assertEquals($expectedValue, $connection->getAttribute($attribute));
        }
    }

    /** A plain query can be executed and returns the selected value. */
    public function testCanExecuteQuery(): void
    {
        $row = Database::getInstance()->query('SELECT 1 as test')->fetch();

        $this->assertEquals(['test' => 1], $row);
    }

    /** prepare() returns a PDOStatement. */
    public function testCanPrepareStatement(): void
    {
        $statement = Database::getInstance()->prepare('SELECT ? as test');

        $this->assertInstanceOf(\PDOStatement::class, $statement);
    }
}