The following shell script automatically checks and summarizes the Apache web server logs of all domains in Plesk. It gives you an overview of traffic and potential problems, even across many domains and large log files.
#!/bin/bash
# Define output files
output_file="/var/www/vhosts/_analyze_logs/traffic-pages.txt"
output_wordpress="/var/www/vhosts/_analyze_logs/traffic-wordpress.txt"
output_5xx="/var/www/vhosts/_analyze_logs/traffic-5xx.txt"
output_404="/var/www/vhosts/_analyze_logs/traffic-404.txt"
output_bots="/var/www/vhosts/_analyze_logs/traffic-bots.txt"
output_ips="/var/www/vhosts/_analyze_logs/traffic-ips.txt"
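# Safeguard (added for robustness): make sure the output directory exists before writing to it
mkdir -p "/var/www/vhosts/_analyze_logs"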
# Define temporary files
temp_file="/tmp/combined_traffic.txt"
temp_wordpress="/tmp/combined_wordpress.txt"
temp_5xx="/tmp/combined_5xx.txt"
temp_404="/tmp/combined_404.txt"
temp_bots="/tmp/combined_bots.txt"
# Default values
hours_back=24
num_entries=30
domain=""
mail_results=""
bots_min_count=500
ips_min_count=1000
log_file="access_ssl_log"
# Check if arguments are provided for hours back, number of entries, specific domain, mail_results, bots_min_count, ips_min_count, and log_file
if [ $# -gt 0 ]; then
hours_back=$1
fi
if [ $# -gt 1 ]; then
num_entries=$2
fi
if [ $# -gt 2 ]; then
domain=$3
fi
if [ $# -gt 3 ]; then
mail_results=$4
fi
if [ $# -gt 4 ]; then
bots_min_count=$5
fi
if [ $# -gt 5 ]; then
ips_min_count=$6
fi
if [ $# -gt 6 ]; then
log_file=$7
fi
# Start timing the script
start_time=$(date +%s%3N)
# Calculate the date and time threshold
date_threshold=$(date -d "-$hours_back hours" +"%d/%b/%Y:%H:%M:%S")
# Initialize the output files
echo "Top $num_entries Most Viewed Pages Across All Logs (Last $hours_back hours)" > "$output_file"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_file"
echo "" >> "$output_file"
echo "Top $num_entries WordPress Related Traffic Across All Logs (Last $hours_back hours)" > "$output_wordpress"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_wordpress"
echo "" >> "$output_wordpress"
echo "Top $num_entries 5xx Errors Across All Logs (Last $hours_back hours)" > "$output_5xx"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_5xx"
echo "" >> "$output_5xx"
echo "Top $num_entries 404 Errors Across All Logs (Last $hours_back hours)" > "$output_404"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_404"
echo "" >> "$output_404"
echo "Bot Traffic >= $bots_min_count Across All Logs (Last $hours_back hours)" > "$output_bots"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_bots"
echo "" >> "$output_bots"
echo "Top $num_entries IP Addresses Across All Logs (Last $hours_back hours)" > "$output_ips"
echo "Generated on: $(date +'%d/%b/%Y %H:%M:%S')" >> "$output_ips"
echo "" >> "$output_ips"
# Process each log and combine results into temporary files
> "$temp_file"
> "$temp_wordpress"
> "$temp_5xx"
> "$temp_404"
# Determine the log files to process
if [ -n "$domain" ]; then
logs_to_process=(/var/www/vhosts/$domain/logs/$log_file)
else
logs_to_process=(/var/www/vhosts/*/logs/$log_file)
fi
# Define pattern for static files
static_files_pattern='\.(css|js|jpg|jpeg|png|gif|webp|ico|svg|woff|woff2|ttf|eot|pdf|zip|map|mp4|webm|mp3|wav|doc|docx|xls|xlsx|rar|tar|gz)(\?.*)?$'
# Define pattern for normal traffic
normal_traffic_pattern="$static_files_pattern|\/wp-content\/|\/wp-admin\/|\/wp-json\/|koko-analytics|wp-cron\.php|\/wp-includes\/|wc-ajax=|\/favicons\/|\/xmlrpc\.php|\/feed\/|robots\.txt|sitemap|wp-login\.php"
# Define pattern for WordPress traffic
wordpress_traffic_pattern="wp-cron\.php|\/wp-includes\/|wc-ajax=|\/xmlrpc\.php|wp-login\.php|\/wp-json\/|\/wp-admin\/|\/login\/|\/wp-content\/themes\/|\/wp-content\/plugins\/|\/feed\/|\/wp-comments-post\.php|\/trackback\/"
# Define bot search patterns
bot_pattern_include="bot|spider|crawler|slurp|bing|yandex|baidu|AdsBot-Google|Googlebot|Applebot|BingSapphire|Plesk screenshot bot|bingbot|Bytespider|DuckDuckBot|Xing Bot|YandexBot|Sogou Spider|Yahoo! Slurp|Facebot"
bot_pattern_exclude="pingdom|UptimeRobot|StatusCake|Site24x7|Uptime\.com|Monitis|Uptrends|Dotcom-Monitor|Updown\.io|Hetrix|NodePing"
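# Note: the awk field numbers below assume the standard Apache "combined" log format
# (%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"), which Plesk typically uses for access_ssl_log:
# with the default field separator, $4 is the timestamp (prefixed with "["), $7 the request path and $9 the status code;
# with -F\" the user agent is field $6. The lexicographic date comparison is only approximate across month boundaries.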
for log in "${logs_to_process[@]}"; do
# Extract domain name
domain=$(basename "$(dirname "$(dirname "$log")")")
# Process normal traffic
awk -v date_threshold="$date_threshold" -v domain="$domain" '$4 > "["date_threshold {print domain $7}' "$log" | \
grep -v -E "$normal_traffic_pattern" >> "$temp_file"
# Process WordPress traffic
awk -v date_threshold="$date_threshold" -v domain="$domain" '$4 > "["date_threshold {print domain $7}' "$log" | \
grep -E "$wordpress_traffic_pattern" | grep -v -E "$static_files_pattern" >> "$temp_wordpress"
# Process 5xx errors
awk -v date_threshold="$date_threshold" -v domain="$domain" '$4 > "["date_threshold && $9 >= 500 && $9 < 600 {print domain $7}' "$log" >> "$temp_5xx"
# Process 404 errors
awk -v date_threshold="$date_threshold" -v domain="$domain" '$4 > "["date_threshold && $9 == 404 {print domain $7}' "$log" >> "$temp_404"
# Process bot traffic
# Store results in variable to check count
results=$(awk -v date_threshold="$date_threshold" '$4 > "["date_threshold' "$log" | \
awk -F\" '{print $6}' | grep -i -E "$bot_pattern_include" | grep -i -v -E "$bot_pattern_exclude" | sort | uniq -c | sort -rn | awk -v min="$bots_min_count" '$1 >= min')
# Only output if results exist
if [ ! -z "$results" ]; then
echo "$domain" >> "$output_bots"
echo "$results" >> "$output_bots"
echo " " >> "$output_bots"
fi
# Process IP addresses
# Extract IP addresses and count occurrences
results=$(awk -v date_threshold="$date_threshold" '$4 > "["date_threshold {print $1}' "$log" | sort | uniq -c | sort -rn | awk -v min="$ips_min_count" '$1 >= min')
# Only output if results exist
if [ ! -z "$results" ]; then
echo "$domain" >> "$output_ips"
echo "$results" >> "$output_ips"
echo " " >> "$output_ips"
fi
done
# Aggregate and sort results for normal traffic
sort "$temp_file" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_file"
# Aggregate and sort results for WordPress traffic
sort "$temp_wordpress" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_wordpress"
# Aggregate and sort results for 5xx errors
sort "$temp_5xx" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_5xx"
# Aggregate and sort results for 404 errors
sort "$temp_404" | uniq -c | sort -rn | head -n "$num_entries" >> "$output_404"
# Clean up temporary files
rm "$temp_file"
rm "$temp_wordpress"
rm "$temp_5xx"
rm "$temp_404"
# End timing the script
end_time=$(date +%s%3N)
execution_time=$((end_time - start_time))
echo "Results saved to:"
echo "- $output_file"
echo "- $output_wordpress"
echo "- $output_5xx"
echo "- $output_404"
echo "- $output_bots"
echo "- $output_ips"
echo "Script execution time: ${execution_time}ms"
Shell script last updated on December 5, 2024.
The script creates the files traffic-pages.txt, traffic-wordpress.txt, traffic-5xx.txt, traffic-404.txt, traffic-bots.txt, and traffic-ips.txt, which contain the corresponding filtered data.
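For illustration, here is how the script could be invoked, assuming it has been saved as /root/analyze_logs.sh and made executable (the path and file name are arbitrary). All arguments are optional and positional: hours back, number of entries, domain, mail address, minimum bot count, minimum IP count, and log file name. Note that the mail address parameter is accepted but not used in the version shown above.

# Analyze the last 24 hours across all domains with the default settings
/root/analyze_logs.sh

# Analyze the last 48 hours, show the top 50 entries, but only for one domain
/root/analyze_logs.sh 48 50 example.com

# Use the plain HTTP log instead of the SSL log, with custom bot/IP thresholds
/root/analyze_logs.sh 24 30 "" "" 200 500 access_log

# Optional: run daily at 06:00 via cron
0 6 * * * /root/analyze_logs.sh 24 30 >/dev/null 2>&1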