cache-apt-pkgs-action/scripts/check_utf8.sh
awalsh128 07366a6d1e - Added CLAUDE.md guidance with preferences.
- Refactored README.md
- Added workflows for version export and management.
- Removed src directory, following Go best practices
- Added COMMANDS.md documentation

Saving the AI semi-slop for now with broken states to get a snapshot.
Too lazy to set up another chained repo.
2025-08-29 17:30:25 -07:00

84 lines
2.4 KiB
Bash
Executable file

#!/bin/bash
#==============================================================================
# check_utf8.sh
#==============================================================================
#
# DESCRIPTION:
# Script to check UTF-8 encoding in text files under the current directory.
# Identifies files that are not UTF-8 (or ASCII) encoded, reports them, and
# converts them to UTF-8 in place using iconv.
# Skips the .git directory, binary files and common non-text file types.
#
# USAGE:
# ./scripts/check_utf8.sh
#
# OPTIONS:
# None. The script always scans the current working directory; command-line
# arguments (such as <file> or <directory>) are not read.
#
# DEPENDENCIES:
# - bash
# - file (for file type detection)
# - iconv (for encoding conversion)
#==============================================================================
# Required tools
# Abort early when one of the external tools used further down is missing.
# The diagnostic goes to stderr so it is not mixed into the per-file report
# that the rest of the script prints on stdout.
for required_tool in file iconv; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "$required_tool command not found. Please install it." >&2
    exit 1
  fi
done
# Find all potential text files, excluding certain directories and files,
# and convert every one that is not already UTF-8/ASCII to UTF-8 in place.

#######################################
# List the candidate files below the current directory.
# Excludes everything under ./.git and a fixed set of binary extensions.
# Outputs: NUL-terminated paths on stdout, so paths containing ':',
#          spaces or newlines survive intact.
#######################################
list_candidate_files() {
  find . -type f \
    ! -path "./.git/*" \
    ! -name "*.png" \
    ! -name "*.jpg" \
    ! -name "*.jpeg" \
    ! -name "*.gif" \
    ! -name "*.ico" \
    ! -name "*.bin" \
    ! -name "*.exe" \
    ! -name "*.dll" \
    ! -name "*.so" \
    ! -name "*.dylib" \
    -print0
}

#######################################
# Inspect one file and convert it to UTF-8 when needed.
# Arguments: $1 - path of the file to inspect
# Outputs:   progress report on stdout; I/O failures on stderr
# Returns:   0 always, so one problematic file never stops the scan
#######################################
convert_file_to_utf8() {
  local file_path=$1
  local mime_type encoding temp_file
  # Kept in a variable because the bracket expression contains a space.
  local -r charset_re='charset=([^ ]+)'

  # Brief mode prints only "type/subtype; charset=xxx", so the path never
  # has to be split back out of the output (a ':' in a file name used to
  # truncate the path).
  mime_type=$(file -b -i "$file_path")

  # Skip non-text files
  if [[ $mime_type != *"text/"* &&
    $mime_type != *"application/json"* &&
    $mime_type != *"application/x-yaml"* &&
    $mime_type == *"binary"* ]]; then
    echo "⏭️ Skipping non-text file: $file_path ($mime_type)"
    return 0
  fi

  if [[ $mime_type =~ $charset_re ]]; then
    encoding=${BASH_REMATCH[1]}
  else
    encoding="unknown"
  fi

  # Skip if already UTF-8 or ASCII
  if [[ $encoding == "utf-8" || $encoding == "us-ascii" ]]; then
    echo "$file_path is already UTF-8"
    return 0
  fi

  echo "⚠️ Converting $file_path from $encoding to UTF-8"

  # Use an unpredictable temporary file created by mktemp (by default
  # outside the scanned tree) instead of "<file>.tmp", which could clobber
  # an existing file and be picked up by the running find.
  if ! temp_file=$(mktemp); then
    echo "⚠️ Could not create a temporary file for $file_path" >&2
    return 0
  fi

  # Try to convert the file to UTF-8. A charset iconv does not know (for
  # example "unknown" or "unknown-8bit") makes iconv fail, in which case
  # the file is left untouched.
  if ! iconv -f "$encoding" -t UTF-8 "$file_path" >"$temp_file" 2>/dev/null; then
    echo "⚠️ File $file_path appears to be binary or already UTF-8"
  # Write the content back into the existing file rather than moving the
  # temporary file over it, so the file mode (e.g. the executable bit) and
  # ownership are preserved.
  elif cat "$temp_file" >"$file_path"; then
    echo "✓ Successfully converted $file_path to UTF-8"
  else
    echo "⚠️ Could not write converted content back to $file_path" >&2
  fi

  rm -f -- "$temp_file"
  return 0
}

#######################################
# Scan the current directory tree and convert every candidate file.
# Outputs: per-file report on stdout
#######################################
convert_tree_to_utf8() {
  local candidate
  while IFS= read -r -d '' candidate; do
    convert_file_to_utf8 "$candidate"
  done < <(list_candidate_files)
}

convert_tree_to_utf8