mirror of
https://github.com/m1ngsama/automa.git
synced 2026-02-08 06:24:05 +00:00
- Add QUICKSTART.md for 5-minute setup guide - Add CHEATSHEET.md for quick command reference - Add OPTIMIZATION_SUMMARY.md with complete architecture overview - Add detailed architecture documentation in docs/ - ARCHITECTURE.md: System design and component details - IMPLEMENTATION.md: Step-by-step implementation guide - architecture-recommendations.md: Component selection rationale - Add .env.example template for configuration Following KISS principles and Unix philosophy for self-hosted IaC platform.
21 KiB
21 KiB
Automa 实施指南
目录结构优化
推荐的项目结构
automa/
├── .env # 全局环境变量(加密存储)
├── .env.example # 环境变量模板
├── .gitignore
├── Makefile # 统一命令入口
├── config.sh # 中央配置
├── docker-compose.yml # 全局编排(可选)
│
├── bin/ # 全局脚本
│ ├── backup.sh
│ ├── healthcheck.sh
│ ├── deploy.sh # 新增:统一部署脚本
│ ├── rollback.sh # 新增:回滚脚本
│ └── lib/
│ ├── common.sh
│ └── secrets.sh # 新增:Secrets 管理
│
├── docs/ # 文档
│ ├── architecture.md
│ ├── deployment.md
│ ├── disaster-recovery.md # 新增:灾难恢复手册
│ └── troubleshooting.md
│
├── infrastructure/ # 新增:基础设施服务
│ ├── caddy/
│ │ ├── Caddyfile
│ │ ├── docker-compose.yml
│ │ └── data/
│ ├── monitoring/
│ │ ├── docker-compose.yml
│ │ ├── prometheus/
│ │ │ ├── prometheus.yml
│ │ │ └── rules/
│ │ ├── grafana/
│ │ │ ├── datasources.yml
│ │ │ ├── grafana.ini
│ │ │ └── dashboards/
│ │ ├── loki/
│ │ │ └── loki-config.yml
│ │ └── promtail/
│ │ └── promtail-config.yml
│ ├── watchtower/
│ │ └── docker-compose.yml
│ ├── duplicati/
│ │ └── docker-compose.yml
│ └── fail2ban/
│ ├── docker-compose.yml
│ └── jail.d/
│
├── services/ # 应用服务(重命名)
│ ├── minecraft/
│ │ ├── docker-compose.yml
│ │ ├── .env
│ │ ├── scripts/
│ │ ├── configs/
│ │ ├── data/
│ │ └── mods/
│ ├── teamspeak/
│ │ ├── docker-compose.yml
│ │ └── .env
│ └── nextcloud/
│ ├── docker-compose.yml
│ └── .env
│
├── backups/ # 本地备份目录
│ ├── minecraft/
│ ├── teamspeak/
│ └── nextcloud/
│
├── secrets/ # 加密的 Secrets(不进 Git)
│ ├── .env.encrypted
│ └── keys/
│
└── tests/ # 新增:测试脚本
├── test-backup.sh
├── test-restore.sh
└── test-monitoring.sh
Docker Compose 最佳实践
1. 网络架构配置
# infrastructure/networks.yml
# Shared network definitions that every service stack can reference.
networks:
  # Public network: Caddy plus externally reachable services.
  public:
    name: automa_public
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16
    labels:
      com.automa.network: "public"
      com.automa.description: "Public-facing services"

  # Monitoring network: internal only.
  monitoring:
    name: automa_monitoring
    driver: bridge
    internal: true  # containers on this network get no outbound internet access
    ipam:
      config:
        - subnet: 172.21.0.0/16
    labels:
      com.automa.network: "monitoring"

  # Nextcloud network.
  nextcloud:
    name: automa_nextcloud
    driver: bridge
    ipam:
      config:
        - subnet: 172.22.0.0/16
    labels:
      com.automa.network: "nextcloud"

  # TeamSpeak network.
  teamspeak:
    name: automa_teamspeak
    driver: bridge
    ipam:
      config:
        - subnet: 172.23.0.0/16
    labels:
      com.automa.network: "teamspeak"
使用方法:
# Create the networks (one bridge network per subnet; monitoring is internal-only)
docker network create --driver bridge --subnet 172.20.0.0/16 automa_public
docker network create --driver bridge --subnet 172.21.0.0/16 --internal automa_monitoring
docker network create --driver bridge --subnet 172.22.0.0/16 automa_nextcloud
docker network create --driver bridge --subnet 172.23.0.0/16 automa_teamspeak

# Or via the Makefile
make network-create
2. Caddy 反向代理配置
infrastructure/caddy/docker-compose.yml
services:
  # Caddy reverse proxy: terminates TLS and fronts Nextcloud, Grafana and Duplicati.
  caddy:
    image: caddy:2.7-alpine
    container_name: automa-caddy
    restart: unless-stopped
    networks:
      - automa_public
      - automa_nextcloud
      - automa_monitoring
    ports:
      - "80:80"
      - "443:443"
      - "443:443/udp"  # HTTP/3 (QUIC)
    volumes:
      - ./Caddyfile:/etc/caddy/Caddyfile:ro
      - ./data:/data
      - ./config:/config
      - /var/log/caddy:/var/log/caddy
    environment:
      # NOTE(review): ACME_AGREE looks like a Caddy v1 setting; confirm whether
      # the v2 image reads it at all before relying on it.
      - ACME_AGREE=true
      - DOMAIN=${DOMAIN:-example.com}
      - NEXTCLOUD_HOST=nextcloud
      - GRAFANA_HOST=grafana
    labels:
      - "com.automa.service=caddy"
      - "com.automa.category=infrastructure"
      - "com.centurylinklabs.watchtower.enable=true"
    healthcheck:
      # Probe the running server instead of `caddy validate`: validation only
      # parses the config file and keeps passing even if Caddy has stopped
      # serving. The request targets the plain-HTTP `health.<domain>` site
      # defined in the Caddyfile, which answers "OK" with status 200.
      test:
        - CMD
        - wget
        - -q
        - --spider
        - --header
        - "Host: health.${DOMAIN:-example.com}"
        - http://127.0.0.1:80/
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        labels: "com.automa.service"

# All networks are created outside this stack (see the network creation step).
networks:
  automa_public:
    external: true
  automa_nextcloud:
    external: true
  automa_monitoring:
    external: true
infrastructure/caddy/Caddyfile
# Global options
{
    email admin@{$DOMAIN}
    admin off  # disable the admin API (production)

    # Collect per-request HTTP metrics so the /metrics endpoint below has
    # caddy_http_* series to expose.
    servers {
        metrics
    }

    # Default logger configuration
    log {
        output file /var/log/caddy/access.log {
            roll_size 100mb
            roll_keep 5
        }
        format json
    }
}

# Prometheus metrics endpoint.
# With `admin off` nothing listens on :2019 any more, so a scrape target of
# caddy:2019 would always be down. Serve metrics from a dedicated plain-HTTP
# site on that port instead; the port is not published by the compose file,
# so it is only reachable from the Docker networks Caddy is attached to.
http://:2019 {
    metrics /metrics
}

# Nextcloud
cloud.{$DOMAIN} {
    # HSTS
    header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"

    # Security headers
    header X-Content-Type-Options "nosniff"
    header X-Frame-Options "SAMEORIGIN"
    header X-XSS-Protection "1; mode=block"
    header Referrer-Policy "strict-origin-when-cross-origin"

    # Strip headers that disclose the backend software
    header {
        -X-Powered-By
        -Server
    }

    # CalDAV/CardDAV service discovery: Nextcloud behind a reverse proxy needs
    # these well-known paths redirected to its DAV endpoint.
    redir /.well-known/carddav /remote.php/dav/ 301
    redir /.well-known/caldav /remote.php/dav/ 301

    # Reverse proxy to the Nextcloud container
    reverse_proxy nextcloud:80 {
        header_up X-Forwarded-Proto {scheme}
        header_up X-Real-IP {remote_host}
        header_up X-Forwarded-For {remote_host}
        header_up X-Forwarded-Host {host}
    }

    # Allow large file uploads
    request_body {
        max_size 10GB
    }

    # Access log for this site
    log {
        output file /var/log/caddy/nextcloud-access.log {
            roll_size 50mb
            roll_keep 3
        }
    }

    # gzip compression
    encode gzip

    # Long-lived caching for static assets
    @static {
        path *.js *.css *.png *.jpg *.jpeg *.gif *.ico *.woff *.woff2
    }
    header @static Cache-Control "public, max-age=31536000, immutable"
}

# Grafana dashboards
grafana.{$DOMAIN} {
    # Private-network matcher (optional).
    # NOTE(review): this matcher is defined but not applied to any directive
    # in this site, so it currently restricts nothing.
    @local {
        remote_ip 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16
    }

    # Basic authentication in front of Grafana for public access.
    # WARNING: the hash below is published in this guide (password: changeme).
    # Generate your own with `caddy hash-password` before deploying.
    basicauth {
        admin $2a$14$Zkx19XLiW6VYouLHR5NmfOFU0z2GTNmpkT/5qqR7hx4wHAiH9lT4O
    }

    reverse_proxy grafana:3000
    encode gzip
}

# Duplicati backup UI (local networks only)
backup.{$DOMAIN} {
    @local {
        remote_ip 127.0.0.1 ::1 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16
    }
    handle @local {
        reverse_proxy duplicati:8200
    }
    # Everything that did not match @local is refused.
    respond "Access Denied" 403
}

# Health-check endpoint (plain HTTP, no TLS required)
http://health.{$DOMAIN} {
    respond "OK" 200
}

# Default site for the bare domain (answers 404)
{$DOMAIN} {
    respond "Automa Self-Hosted Services" 404
}

# Catch-all for every other plain-HTTP request
http:// {
    # Redirect to HTTPS
    redir https://{host}{uri} permanent
}
3. 监控栈配置
infrastructure/monitoring/docker-compose.yml
services:
  # Prometheus time-series database
  prometheus:
    image: prom/prometheus:v2.48.1
    container_name: automa-prometheus
    restart: unless-stopped
    networks:
      - automa_monitoring
      - automa_nextcloud
      - automa_teamspeak
    ports:
      - "127.0.0.1:9090:9090"  # reachable from the host only
    # NOTE(review): the `docker-containers` job in prometheus.yml points at
    # unix:///var/run/docker.sock, but no Docker socket is mounted here and the
    # container runs as an unprivileged user. Confirm how discovery is meant to
    # reach the Docker API (e.g. a read-only socket proxy).
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'   # keep 30 days
      - '--storage.tsdb.retention.size=20GB'  # cap at 20GB
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    labels:
      - "com.automa.service=prometheus"
      - "com.automa.category=monitoring"
      - "com.centurylinklabs.watchtower.enable=false"  # updated manually
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
    user: "65534:65534"  # nobody

  # Grafana dashboards
  grafana:
    image: grafana/grafana:10.2.3
    container_name: automa-grafana
    restart: unless-stopped
    networks:
      - automa_monitoring
      - automa_public
    ports:
      - "127.0.0.1:3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./grafana/grafana.ini:/etc/grafana/grafana.ini:ro
    environment:
      # grafana.ini references ${DOMAIN}, but DOMAIN itself is not passed into
      # the container, so `domain` would expand to "grafana.". GF_* variables
      # override the ini file, so set the domain explicitly here as well.
      - GF_SERVER_DOMAIN=grafana.${DOMAIN:-example.com}
      - GF_SERVER_ROOT_URL=https://grafana.${DOMAIN:-example.com}
      # NOTE(review): falls back to "changeme" when GRAFANA_ADMIN_PASSWORD is
      # unset; set a real password in .env before exposing Grafana.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-changeme}
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
      - GF_AUTH_ANONYMOUS_ENABLED=false
      - GF_ANALYTICS_REPORTING_ENABLED=false
    labels:
      - "com.automa.service=grafana"
      - "com.automa.category=monitoring"
      - "com.centurylinklabs.watchtower.enable=true"
    user: "472:472"  # grafana

  # Loki log aggregation
  loki:
    image: grafana/loki:2.9.3
    container_name: automa-loki
    restart: unless-stopped
    networks:
      - automa_monitoring
    ports:
      - "127.0.0.1:3100:3100"
    volumes:
      - ./loki/loki-config.yml:/etc/loki/loki-config.yml:ro
      - loki-data:/loki
    command: -config.file=/etc/loki/loki-config.yml
    labels:
      - "com.automa.service=loki"
      - "com.automa.category=monitoring"
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Promtail log shipper
  promtail:
    image: grafana/promtail:2.9.3
    container_name: automa-promtail
    restart: unless-stopped
    networks:
      - automa_monitoring
    volumes:
      - ./promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
      - /var/log:/var/log:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # promtail-config.yml keeps its read offsets in /tmp/positions.yaml.
      # Persist /tmp so the offsets survive container re-creation; otherwise
      # Promtail starts over and re-ships log files it has already sent.
      - promtail-positions:/tmp
    command: -config.file=/etc/promtail/promtail-config.yml
    labels:
      - "com.automa.service=promtail"
      - "com.automa.category=monitoring"

  # cAdvisor container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    container_name: automa-cadvisor
    restart: unless-stopped
    networks:
      - automa_monitoring
    ports:
      - "127.0.0.1:8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    privileged: true
    devices:
      - /dev/kmsg
    labels:
      - "com.automa.service=cadvisor"
      - "com.automa.category=monitoring"
    command:
      - '--housekeeping_interval=30s'
      - '--docker_only=true'
      - '--disable_metrics=percpu,process,tcp,udp,diskIO,disk,network'

  # Node Exporter host metrics
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: automa-node-exporter
    restart: unless-stopped
    networks:
      - automa_monitoring
    ports:
      - "127.0.0.1:9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # `$$` is Compose escaping for a literal `$` in the regular expression.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    labels:
      - "com.automa.service=node-exporter"
      - "com.automa.category=monitoring"

# All networks are created outside this stack.
networks:
  automa_monitoring:
    external: true
  automa_public:
    external: true
  automa_nextcloud:
    external: true
  automa_teamspeak:
    external: true

volumes:
  prometheus-data:
    name: automa_prometheus_data
  grafana-data:
    name: automa_grafana_data
  loki-data:
    name: automa_loki_data
  promtail-positions:
    name: automa_promtail_positions
infrastructure/monitoring/prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'automa'
    environment: 'production'

# Alerting rules
rule_files:
  - '/etc/prometheus/rules/*.yml'

# Alertmanager (optional)
# alerting:
#   alertmanagers:
#     - static_configs:
#         - targets: ['alertmanager:9093']

# Scrape targets
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
        labels:
          service: 'prometheus'

  # Node Exporter (host)
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          service: 'node-exporter'
          instance: 'automa-host'

  # cAdvisor (containers)
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
        labels:
          service: 'cadvisor'

  # Caddy metrics (requires Caddy to serve /metrics on port 2019)
  - job_name: 'caddy'
    static_configs:
      - targets: ['caddy:2019']
        labels:
          service: 'caddy'

  # Nextcloud exporter (requires a deployed nextcloud-exporter)
  - job_name: 'nextcloud'
    static_configs:
      - targets: ['nextcloud-exporter:9205']
        labels:
          service: 'nextcloud'

  # Minecraft exporter (requires a deployed minecraft-exporter)
  - job_name: 'minecraft'
    static_configs:
      - targets: ['minecraft-exporter:9225']
        labels:
          service: 'minecraft'

  # Docker container auto-discovery
  # NOTE(review): needs the Docker API socket to be reachable from inside the
  # Prometheus container; confirm the compose file provides it.
  - job_name: 'docker-containers'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
    relabel_configs:
      - source_labels: [__meta_docker_container_label_com_automa_service]
        target_label: service
      - source_labels: [__meta_docker_container_label_com_automa_category]
        target_label: category
      # Docker reports container names with a leading "/". Strip it so the
      # `container` label matches the one Promtail attaches to logs and reads
      # cleanly in alert annotations.
      - source_labels: [__meta_docker_container_name]
        regex: '/?(.*)'
        target_label: container
infrastructure/monitoring/prometheus/rules/alerts.yml
groups:
  - name: automa_alerts
    interval: 30s
    rules:
      # Container health: a discovered container target has been down for 5 minutes.
      - alert: ContainerDown
        expr: up{job="docker-containers"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "容器 {{ $labels.container }} 已停止"
          description: "服务 {{ $labels.service }} 的容器已停止超过 5 分钟"

      # Host memory usage above 85% for 10 minutes.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高 ({{ $value | humanize }}%)"
          description: "主机内存使用率超过 85%"

      # Root filesystem usage above 80% for 10 minutes.
      # $value is the USED percentage, so the summary reports usage rather
      # than remaining space.
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"})) * 100 > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足 (已使用 {{ $value | humanize }}%)"
          description: "根分区磁盘使用率超过 80%"

      # Host CPU usage above 80% for 10 minutes.
      # rate() averages over the whole 5m window; irate() only looks at the
      # last two samples and is too spiky for an alert with a `for` clause.
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "CPU 使用率过高 ({{ $value | humanize }}%)"
          description: "主机 CPU 使用率持续超过 80%"

      # Nextcloud target down for 3 minutes.
      - alert: NextcloudDown
        expr: up{service="nextcloud"} == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Nextcloud 服务不可用"
          description: "Nextcloud 服务已停止超过 3 分钟"

      # Minecraft player count (example rule).
      - alert: MinecraftHighLoad
        expr: minecraft_players_online > 15
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "Minecraft 在线玩家过多"
          description: "当前在线玩家数:{{ $value }}"

      # Backup failure: last success older than 48 hours (needs a custom exporter).
      - alert: BackupFailed
        expr: automa_backup_last_success_timestamp < (time() - 86400 * 2)
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: "备份失败"
          description: "服务 {{ $labels.service }} 超过 48 小时未成功备份"
4. Loki 配置
infrastructure/monitoring/loki/loki-config.yml
# Single-instance Loki storing index and chunks on the local filesystem.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2023-01-01
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    cache_ttl: 24h
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h  # 7 days
  retention_period: 30d             # keep 30 days
  max_query_length: 721h            # 30 days

chunk_store_config:
  max_look_back_period: 30d

# NOTE(review): retention is configured both here and in the compactor below;
# confirm which mechanism is meant to enforce the 30-day window.
table_manager:
  retention_deletes_enabled: true
  retention_period: 30d

compactor:
  working_directory: /loki/boltdb-shipper-compactor
  shared_store: filesystem
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
infrastructure/monitoring/promtail/promtail-config.yml
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# Read offsets are tracked here. /tmp is container-local unless a volume is
# mounted over it.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Docker container logs, discovered through the Docker socket
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Drop the leading "/" from the container name.
      - source_labels:
          - '__meta_docker_container_name'
        regex: '/(.*)'
        target_label: 'container'
      - source_labels:
          - '__meta_docker_container_label_com_automa_service'
        target_label: 'service'
      - source_labels:
          - '__meta_docker_container_label_com_automa_category'
        target_label: 'category'
    pipeline_stages:
      - docker: {}
      - json:
          expressions:
            level: level
            msg: message
      - labels:
          level:
      # NOTE(review): `timestamp` is not extracted by the json stage above;
      # presumably it is expected to come from the docker stage - confirm.
      - timestamp:
          source: timestamp
          format: RFC3339

  # System logs
  - job_name: system
    static_configs:
      - targets:
          - localhost
        labels:
          job: varlogs
          __path__: /var/log/*.log

  # Caddy access logs (JSON lines)
  - job_name: caddy
    static_configs:
      - targets:
          - localhost
        labels:
          job: caddy
          __path__: /var/log/caddy/*.log
    pipeline_stages:
      - json:
          expressions:
            level: level
            ts: ts
            logger: logger
            msg: msg
            status: status
            method: request.method
            uri: request.uri
            duration: duration
      - labels:
          level:
          status:
          method:
      - timestamp:
          source: ts
          format: Unix
5. Grafana 配置
infrastructure/monitoring/grafana/datasources.yml
apiVersion: 1

datasources:
  # Prometheus (default datasource)
  - name: Prometheus
    # Explicit uid so the Loki derived field below can reference this
    # datasource; without it `datasourceUid: Prometheus` points at nothing.
    uid: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: 15s  # matches the Prometheus scrape interval
      queryTimeout: 60s

  # Loki
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      maxLines: 1000
      derivedFields:
        # Turns `trace_id=<id>` occurrences in log lines into a link.
        - datasourceUid: Prometheus
          matcherRegex: "trace_id=(\\w+)"
          name: TraceID
          # `$$` keeps the literal `${__value.raw}` from being treated as an
          # environment variable during provisioning.
          url: "$${__value.raw}"
infrastructure/monitoring/grafana/grafana.ini
; Grafana server configuration.
; NOTE(review): the ${...} references below are taken from the Grafana
; process environment - confirm DOMAIN and GRAFANA_ADMIN_PASSWORD are
; actually passed into the container, otherwise they expand to nothing.

[server]
domain = grafana.${DOMAIN}
root_url = https://grafana.${DOMAIN}
serve_from_sub_path = false

[security]
admin_user = admin
admin_password = ${GRAFANA_ADMIN_PASSWORD}
disable_gravatar = true
cookie_secure = true
cookie_samesite = strict

[auth]
disable_login_form = false
disable_signout_menu = false

[auth.anonymous]
enabled = false

[auth.basic]
enabled = true

[analytics]
reporting_enabled = false
check_for_updates = false

[log]
mode = console file
level = info

[paths]
provisioning = /etc/grafana/provisioning

[dashboards]
default_home_dashboard_path = /etc/grafana/provisioning/dashboards/home.json
待续...
下一部分将包括:
- Watchtower 自动更新配置
- Duplicati 备份配置
- Fail2ban 安全配置
- Secrets 管理
- Makefile 更新
- 部署脚本