Update paths for config/ directory structure
Adjusted all references to match the new config/ structure:
- docker/config/nginx/default.conf → config/nginx/default.conf
- docker/init.sql → config/docker/init.sql
- docker/start.sh → config/docker/start.sh

Updated files:
- docker-compose.yml: Updated volume mount paths
- README.md: Updated project structure documentation

The new structure consolidates all configuration files under config/ for better organization and clarity. Tested and verified all services running correctly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
66
config/docker/init.sql
Normal file
66
config/docker/init.sql
Normal file
@@ -0,0 +1,66 @@
|
||||
-- Database initialization script for Web Crawler
|
||||
|
||||
-- crawl_jobs: crawl job records, keyed by an auto-increment id.
-- A new row defaults to status 'pending' with both counters at 0;
-- started_at and completed_at stay NULL until a writer sets them.
-- Secondary indexes support lookups by domain and by status.
CREATE TABLE IF NOT EXISTS crawl_jobs (
    id           INT NOT NULL AUTO_INCREMENT,
    domain       VARCHAR(255) NOT NULL,
    status       ENUM('pending', 'running', 'completed', 'failed') DEFAULT 'pending',
    total_pages  INT DEFAULT 0,
    total_links  INT DEFAULT 0,
    started_at   TIMESTAMP NULL DEFAULT NULL,
    completed_at TIMESTAMP NULL DEFAULT NULL,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    KEY idx_domain (domain),
    KEY idx_status (status)
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4
  COLLATE = utf8mb4_unicode_ci;
|
||||
|
||||
-- pages: one row per URL stored for a crawl job.
-- Rows are removed automatically when the owning crawl_jobs row is deleted
-- (ON DELETE CASCADE).
-- url can hold 2048 characters, but idx_url and unique_job_url cover only its
-- first 255 characters, so uniqueness per job is enforced on that prefix.
-- NOTE(review): two URLs under one job that share their first 255 characters
-- are therefore treated as duplicates, and the _ci collation makes the
-- comparison case-insensitive -- confirm this is acceptable to the crawler.
CREATE TABLE IF NOT EXISTS pages (
    id           INT NOT NULL AUTO_INCREMENT,
    crawl_job_id INT NOT NULL,
    url          VARCHAR(2048) NOT NULL,
    title        VARCHAR(500) DEFAULT NULL,
    status_code  INT DEFAULT NULL,
    content_type VARCHAR(100) DEFAULT NULL,
    crawled_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (crawl_job_id)
        REFERENCES crawl_jobs (id)
        ON DELETE CASCADE,
    KEY idx_crawl_job (crawl_job_id),
    KEY idx_url (url(255)),
    UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4
  COLLATE = utf8mb4_unicode_ci;
|
||||
|
||||
-- links: one row per link, tied to the page it belongs to (page_id) and to
-- the crawl job (crawl_job_id).
-- Deleting either the parent page or the parent crawl job removes the link
-- rows (both foreign keys use ON DELETE CASCADE).
-- is_nofollow defaults to FALSE and is_internal defaults to TRUE.
-- The URL indexes cover only the first 255 characters of each
-- 2048-character column.
CREATE TABLE IF NOT EXISTS links (
    id           INT NOT NULL AUTO_INCREMENT,
    page_id      INT NOT NULL,
    crawl_job_id INT NOT NULL,
    source_url   VARCHAR(2048) NOT NULL,
    target_url   VARCHAR(2048) NOT NULL,
    link_text    VARCHAR(1000) DEFAULT NULL,
    is_nofollow  BOOLEAN DEFAULT FALSE,
    is_internal  BOOLEAN DEFAULT TRUE,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    FOREIGN KEY (page_id)
        REFERENCES pages (id)
        ON DELETE CASCADE,
    FOREIGN KEY (crawl_job_id)
        REFERENCES crawl_jobs (id)
        ON DELETE CASCADE,
    KEY idx_page (page_id),
    KEY idx_crawl_job (crawl_job_id),
    KEY idx_source_url (source_url(255)),
    KEY idx_target_url (target_url(255)),
    KEY idx_nofollow (is_nofollow)
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4
  COLLATE = utf8mb4_unicode_ci;
|
||||
|
||||
-- crawl_queue: queue table for parallel processing; each row is a URL queued
-- under a crawl job.
-- New entries default to status 'pending', depth 0 and retry_count 0;
-- processed_at stays NULL until a writer sets it.
-- Rows are removed automatically when the owning crawl_jobs row is deleted
-- (ON DELETE CASCADE).
-- unique_job_url blocks a second entry for the same job and URL, compared on
-- the first 255 characters of url only.
-- NOTE(review): longer URLs that share a 255-character prefix within one job
-- cannot both be queued -- confirm this is acceptable.
CREATE TABLE IF NOT EXISTS crawl_queue (
    id           INT NOT NULL AUTO_INCREMENT,
    crawl_job_id INT NOT NULL,
    url          VARCHAR(2048) NOT NULL,
    depth        INT DEFAULT 0,
    status       ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
    retry_count  INT DEFAULT 0,
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    processed_at TIMESTAMP NULL DEFAULT NULL,
    PRIMARY KEY (id),
    FOREIGN KEY (crawl_job_id)
        REFERENCES crawl_jobs (id)
        ON DELETE CASCADE,
    KEY idx_status (status),
    KEY idx_crawl_job (crawl_job_id),
    UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8mb4
  COLLATE = utf8mb4_unicode_ci;
|
||||
Reference in New Issue
Block a user