This commit is contained in:
2025-09-26 21:24:25 +02:00
commit b3e8f2ce85
4 changed files with 434 additions and 0 deletions

19
AGENTS.md Normal file
View File

@@ -0,0 +1,19 @@
# Repository Guidelines
## Project Structure & Module Organization
The codebase is intentionally lean. `index.php` bootstraps the crawl by instantiating `webanalyse` and handing off the crawl identifier. Core crawling logic lives in `webanalyse.php`, which houses HTTP fetching, link extraction, and database persistence. Use `setnew.php` to reset seed data inside the `screaming_frog` schema before a rerun. Keep new helpers in their own PHP files under this root so the autoload includes stay predictable; group SQL migrations or fixtures under a `database/` folder if you add them. IDE settings reside in `.idea/`.
## Build, Test, and Development Commands
Run the project through Apache in XAMPP or start the PHP built-in server with `php -S localhost:8080 index.php` from this directory. Validate syntax quickly via `php -l webanalyse.php` (repeat for any new file). When iterating on crawl logic, truncate runtime tables with `php setnew.php` to restore the baseline dataset.
## Coding Style & Naming Conventions
Follow PSR-12 style cues already in use: 4-space indentation, brace-on-new-line for functions, and `declare(strict_types=1);` at the top of entry scripts. Favour descriptive camelCase for methods (`getMultipleWebsites`) and snake_case only for direct SQL field names. Maintain `mysqli` usage for consistency, and gate new configuration through constants or clearly named environment variables.
## Testing Guidelines
There is no automated suite yet; treat each crawl as an integration test. After code changes, run `php setnew.php` followed by a crawl and confirm that `crawl`, `urls`, and `links` tables reflect the expected row counts. Log anomalies with `error_log()` while developing, and remove or downgrade to structured responses before merging.
## Commit & Pull Request Guidelines
Author commit messages in the present tense with a concise summary (`Add link grouping for external URLs`). Group related SQL adjustments with their PHP changes in the same commit. For pull requests, include: a short context paragraph, reproduction steps, screenshots of key output tables when behaviour changes, and any follow-up tasks. Link tracking tickets or issues so downstream agents can trace decisions.
## Security & Configuration Notes
Database credentials are currently hard-coded for local XAMPP usage. If you introduce environment-based configuration, document expected `.env` keys and ensure credentials are excluded from version control. Never commit production connection details or raw crawl exports.

13
index.php Normal file
View File

@@ -0,0 +1,13 @@
<?php
declare(strict_types=1);
Error_reporting(E_ALL);
ini_set('display_errors', 1);
require_once 'webanalyse.php';
$wa = new webanalyse();
$db = mysqli_connect("localhost", "root", "", "screaming_frog");
$wa-> doCrawl(1);

11
setnew.php Normal file
View File

@@ -0,0 +1,11 @@
<?php
$db = mysqli_connect("localhost", "root", "", "screaming_frog");
$db->query("truncate table crawl");
// $db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/', 1)");
$db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/leistungen/externer-ausbilder-fuer-fachinformatiker/', 1)");
$db->query("truncate table urls");
$urls = $db->query("insert ignore into urls (id, url, crawl_id) select 1,start_url, id from crawl where id = 1"); #->fetch_all(MYSQLI_ASSOC)
$db->query("truncate table links");

391
webanalyse.php Normal file
View File

@@ -0,0 +1,391 @@
<?php
class webanalyse
{
var $db;
function __construct()
{
$this->db = mysqli_connect("localhost", "root", "", "screaming_frog");
}
function getWebsite($url)
{
// cURL-Session initialisieren
$ch = curl_init();
// cURL-Optionen setzen
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // Antwort als String zurückgeben
curl_setopt($ch, CURLOPT_HEADER, true); // Header in der Antwort einschließen
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // Weiterleitungen folgen
curl_setopt($ch, CURLOPT_TIMEOUT, 30); // Timeout nach 30 Sekunden
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // User Agent setzen
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // SSL-Zertifikat nicht prüfen (nur für Tests)
// Anfrage ausführen
$response = curl_exec($ch);
// Fehler überprüfen
if (curl_errno($ch)) {
$error = curl_error($ch);
curl_close($ch);
return ['error' => $error];
}
// Informationen abrufen
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
$totalTime = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
$effectiveUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
// cURL-Session schließen
curl_close($ch);
// Header und Body trennen
$headers = substr($response, 0, $headerSize);
$body = substr($response, $headerSize);
// Header in Array umwandeln
$headerLines = explode("\r\n", trim($headers));
$parsedHeaders = [];
foreach ($headerLines as $line) {
if (strpos($line, ':') !== false) {
list($key, $value) = explode(':', $line, 2);
$parsedHeaders[trim($key)] = trim($value);
}
}
return [
'url' => $effectiveUrl,
'status_code' => $httpCode,
// 'headers_raw' => $headers,
'headers_parsed' => $parsedHeaders,
'body' => $body,
'response_time' => $totalTime,
'body_size' => strlen($body)
];
}
// Multi-cURL Funktion für mehrere URLs
function getMultipleWebsites($urls)
{
$results = [];
$curlHandles = [];
$multiHandle = curl_multi_init();
// Einzelne cURL-Handles für jede URL erstellen
foreach ($urls as $url) {
$ch = curl_init();
// cURL-Optionen setzen (gleich wie bei getWebsite)
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
// Handle zum Multi-Handle hinzufügen
curl_multi_add_handle($multiHandle, $ch);
$curlHandles[$url] = $ch;
}
// Alle Anfragen parallel ausführen
$running = null;
do {
curl_multi_exec($multiHandle, $running);
curl_multi_select($multiHandle);
} while ($running > 0);
// Ergebnisse verarbeiten
foreach ($urls as $url) {
$ch = $curlHandles[$url];
$response = curl_multi_getcontent($ch);
// Fehler überprüfen
if (curl_errno($ch)) {
$error = curl_error($ch);
$results[$url] = ['error' => $error];
} else {
// Informationen abrufen
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
$headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
$totalTime = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
$effectiveUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
// Header und Body trennen
$headers = substr($response, 0, $headerSize);
$body = substr($response, $headerSize);
// Header in Array umwandeln
$headerLines = explode("\r\n", trim($headers));
$parsedHeaders = [];
foreach ($headerLines as $line) {
if (strpos($line, ':') !== false) {
list($key, $value) = explode(':', $line, 2);
$parsedHeaders[trim($key)] = trim($value);
}
}
$results[$url] = [
'url' => $effectiveUrl,
'status_code' => $httpCode,
'headers_parsed' => $parsedHeaders,
'body' => $body,
'response_time' => $totalTime,
'body_size' => strlen($body)
];
}
// Handle aus Multi-Handle entfernen und schließen
curl_multi_remove_handle($multiHandle, $ch);
curl_close($ch);
}
// Multi-Handle schließen
curl_multi_close($multiHandle);
return $results;
}
function processResults(int $crawlID, string $url, array $data)
{
if (!isset($data['error'])) {
$status_code = $data['status_code'];
$response_time = $data['response_time'];
$body_size = $data['body_size'];
$date = date('Y-m-d H:i:s');
$body = $data['body'];
$sql = "UPDATE urls SET
status_code = " . $status_code . ",
response_time = " . ($response_time * 1000) . ",
body_size = " . $body_size . ",
date = now(),
body = '" . $this->db->real_escape_string($body) . "'
WHERE url = '" . $this->db->real_escape_string($url) . "' AND crawl_id = " . $crawlID . " LIMIT 1";
// echo $sql;
$this->db->query($sql);
} else {
// Handle error case if needed
echo "Fehler bei der Analyse von $url: " . $data['error'] . "\n";
}
$this->findNewUrls($crawlID, $body, $url);
}
function findNewUrls(int $crawlID, string $body, string $url) {
$links = $this->extractLinks($body, $url);
$temp = $this->db->query("select id from urls where url = '".$this->db->real_escape_string($url)."' and crawl_id = ".$crawlID." LIMIT 1")->fetch_all(MYSQLI_ASSOC);
$vonUrlId = $temp[0]['id'];
$this->db->query("delete from links where von = ".$vonUrlId);
foreach($links as $l) {
$u = $this->db->query("insert ignore into urls (url, crawl_id) values ('".$this->db->real_escape_string($l['absolute_url'])."',".$crawlID.")");
$id = $this->db->insert_id;
if ($id === 0) {
$qwer = $this->db->query("select id from urls where url = '".$this->db->real_escape_string($l['absolute_url'])."' and crawl_id = ".$crawlID." LIMIT 1")->fetch_all(MYSQLI_ASSOC);
$id = $qwer[0]['id'];
}
$sql_links = "insert ignore into links (von, nach, linktext, dofollow) values (
".$vonUrlId.",
".$id.",
'".$this->db->real_escape_string(mb_convert_encoding($l['text'],"UTF-8"))."',
".(strstr($l['rel']??"", 'nofollow') === false ? 1 : 0)."
)";
echo $sql_links;
$u = $this->db->query($sql_links);
}
print_r($links);
}
function doCrawl(int $crawlID)
{
$urls2toCrawl = $this->db->query("select * from urls where crawl_id = " . $crawlID . " and date is null LIMIT 2")->fetch_all(MYSQLI_ASSOC); // and date is not null
$urls = [];
foreach ($urls2toCrawl as $u) {
$urls[] = $u['url'];
}
$multipleResults = $this->getMultipleWebsites($urls);
// print_r($multipleResults);
foreach ($multipleResults as $url => $data) {
$this->processResults($crawlID, $url, $data);
}
}
function extractLinks($html, $baseUrl = '')
{
$links = [];
// DOMDocument erstellen und HTML laden
$dom = new DOMDocument();
// Fehlerbehandlung für ungültiges HTML
libxml_use_internal_errors(true);
$dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
libxml_clear_errors();
// Alle <a> Tags finden
$aTags = $dom->getElementsByTagName('a');
foreach ($aTags as $index => $aTag) {
$href = $aTag->getAttribute('href');
$text = trim($aTag->textContent);
$rel = $aTag->getAttribute('rel');
$title = $aTag->getAttribute('title');
$target = $aTag->getAttribute('target');
// Nur Links mit href-Attribut
if (!empty($href)) {
// Relative URLs zu absoluten URLs konvertieren
$absoluteUrl = $href;
if (!empty($baseUrl) && !preg_match('/^https?:\/\//', $href)) {
$absoluteUrl = rtrim($baseUrl, '/') . '/' . ltrim($href, '/');
}
$links[] = [
'index' => $index + 1,
'href' => $href,
'absolute_url' => $absoluteUrl,
'text' => $text,
'rel' => $rel ?: null,
'title' => $title ?: null,
'target' => $target ?: null,
'is_external' => $this->isExternalLink($href, $baseUrl),
'link_type' => $this->getLinkType($href),
'is_internal' => $this->isInternalLink($href, $baseUrl)?1:0
];
}
}
return $links;
}
/**
* Prüft ob ein Link extern ist
*/
private function isExternalLink($href, $baseUrl)
{
if (empty($baseUrl)) return null;
// Relative Links sind intern
if (!preg_match('/^https?:\/\//', $href)) {
return false;
}
$baseDomain = parse_url($baseUrl, PHP_URL_HOST);
$linkDomain = parse_url($href, PHP_URL_HOST);
return $baseDomain !== $linkDomain;
}
private function isInternalLink($href, $baseUrl)
{
if (empty($baseUrl)) return null;
// Relative Links sind intern
if (!preg_match('/^https?:\/\//', $href)) {
return true;
}
$baseDomain = parse_url($baseUrl, PHP_URL_HOST);
$linkDomain = parse_url($href, PHP_URL_HOST);
return $baseDomain === $linkDomain;
}
/**
* Bestimmt den Typ des Links
*/
private function getLinkType($href)
{
if (empty($href)) return 'empty';
if (strpos($href, 'mailto:') === 0) return 'email';
if (strpos($href, 'tel:') === 0) return 'phone';
if (strpos($href, '#') === 0) return 'anchor';
if (strpos($href, 'javascript:') === 0) return 'javascript';
if (preg_match('/^https?:\/\//', $href)) return 'absolute';
return 'relative';
}
/**
* Funktion zum Gruppieren der Links nach Typ
*/
function groupLinksByType($links)
{
$grouped = [];
foreach ($links as $link) {
$type = $link['link_type'];
if (!isset($grouped[$type])) {
$grouped[$type] = [];
}
$grouped[$type][] = $link;
}
return $grouped;
}
}