Automating Text Comparison with Scripts

Published: December 2024 | Category: Automation | Reading time: 18 minutes

When you need to compare many files or perform regular comparisons, automation can save significant time and reduce human error. This guide will show you how to automate text comparison tasks using various scripting approaches and integrate them into your development workflow.

Python Scripts for Text Comparison

Python's difflib module provides powerful diff functionality that you can use to build custom comparison scripts.

Basic File Comparison Script

import difflib
import sys

def compare_files(file1, file2):
    """Print a unified diff of two text files to standard output.

    Args:
        file1: Path of the original file; also used as the ``---`` label.
        file2: Path of the modified file; also used as the ``+++`` label.

    Raises:
        OSError: If either file cannot be opened or read.

    Note:
        Line terminators are stripped before diffing, so a difference that
        consists only of a missing newline at the end of a file is not
        reported.
    """
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        # unified_diff() is called with lineterm='' and print() appends its
        # own newline, so keeping the newline read from the file would emit
        # a blank line after every content line of the diff.
        lines1 = [line.rstrip('\n') for line in f1]
        lines2 = [line.rstrip('\n') for line in f2]

    diff = difflib.unified_diff(lines1, lines2,
                                fromfile=file1, tofile=file2,
                                lineterm='')

    for line in diff:
        print(line)

if __name__ == "__main__":
    # Exactly two paths must follow the script name on the command line.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python compare.py file1 file2")
        sys.exit(1)

    original_path, modified_path = cli_args
    compare_files(original_path, modified_path)

Advanced Comparison with Statistics

import difflib
import os
from datetime import datetime

def detailed_compare(file1, file2):
    """Compare two text files and summarise how they differ.

    Args:
        file1: Path of the original file.
        file2: Path of the modified file.

    Returns:
        A dict with the keys:
            'similarity': ``difflib.SequenceMatcher`` ratio computed over
                the two lists of lines.
            'additions': Number of added lines in the unified diff.
            'deletions': Number of removed lines in the unified diff.
            'diff': The unified diff as a list of strings, none of which
                ends with a newline (join them with a newline to render).

    Raises:
        OSError: If either file cannot be opened or read.
    """
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        # Strip the terminators so every entry of the diff (headers, hunk
        # markers and content alike) is consistently un-terminated.
        lines1 = [line.rstrip('\n') for line in f1]
        lines2 = [line.rstrip('\n') for line in f2]

    # Calculate similarity ratio
    similarity = difflib.SequenceMatcher(None, lines1, lines2).ratio()

    # Generate diff
    diff = list(difflib.unified_diff(lines1, lines2,
                                     fromfile=file1, tofile=file2,
                                     lineterm=''))

    # Count changes. The '---'/'+++' file headers are always the first two
    # entries of a non-empty diff, so skip them by position: filtering on the
    # '+++'/'---' prefix would also drop real lines whose own text starts
    # with '++' or '--'.
    body = diff[2:]
    additions = sum(1 for line in body if line.startswith('+'))
    deletions = sum(1 for line in body if line.startswith('-'))

    return {
        'similarity': similarity,
        'additions': additions,
        'deletions': deletions,
        'diff': diff
    }

def generate_report(file1, file2, output_file=None):
    """Build a human-readable comparison report for two files.

    Args:
        file1: Path of the original file.
        file2: Path of the modified file.
        output_file: Optional path to write the report to. When omitted
            (or falsy) the report is printed to standard output instead.

    Returns:
        The result dict produced by ``detailed_compare(file1, file2)``.
    """
    result = detailed_compare(file1, file2)

    # Diff entries may or may not carry their own line terminator; strip it
    # before joining so the rendered diff is never double-spaced.
    diff_text = chr(10).join(line.rstrip('\n') for line in result['diff'])

    report = f"""
Text Comparison Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Files: {file1} vs {file2}

Summary:
- Similarity: {result['similarity']:.2%}
- Lines added: {result['additions']}
- Lines deleted: {result['deletions']}

Detailed Diff:
{diff_text}
"""

    if output_file:
        with open(output_file, 'w') as f:
            f.write(report)
    else:
        print(report)

    return result

Batch File Comparison

import os
import glob
from pathlib import Path

def batch_compare(directory1, directory2, pattern="*.txt"):
    """Compare all files matching pattern in two directories.

    Files are paired by base name. Only regular files are considered.

    Args:
        directory1: First directory to scan.
        directory2: Second directory to scan.
        pattern: Glob pattern applied inside each directory.

    Returns:
        A dict keyed by file name. A file present in both directories maps
        to the ``detailed_compare`` result for the pair; a file present in
        only one directory maps to ``{'error': <message>}`` naming the
        directory it is missing from.
    """
    files1 = sorted(p for p in glob.glob(os.path.join(directory1, pattern))
                    if os.path.isfile(p))
    files2 = sorted(p for p in glob.glob(os.path.join(directory2, pattern))
                    if os.path.isfile(p))

    results = {}

    for file1 in files1:
        filename = os.path.basename(file1)
        file2 = os.path.join(directory2, filename)

        if os.path.isfile(file2):
            results[filename] = detailed_compare(file1, file2)
        else:
            results[filename] = {'error': 'File not found in second directory'}

    # Second pass: a one-sided scan would never mention files that exist
    # only in directory2.
    for file2 in files2:
        filename = os.path.basename(file2)
        counterpart = os.path.join(directory1, filename)
        if filename not in results and not os.path.isfile(counterpart):
            results[filename] = {'error': 'File not found in first directory'}

    return results

def generate_batch_report(results, output_file):
    """Write a plain-text summary of batch comparison results.

    Args:
        results: Mapping of file name to either an ``{'error': ...}`` dict
            or a dict with 'similarity', 'additions' and 'deletions'.
        output_file: Path of the report file; it is overwritten.
    """
    def render(name, outcome):
        # One text chunk per entry: a single line for errors, a
        # three-line summary followed by a blank line otherwise.
        if 'error' in outcome:
            return f"{name}: {outcome['error']}\n"
        return (
            f"{name}: {outcome['similarity']:.2%} similar\n"
            f"  - Added: {outcome['additions']} lines\n"
            f"  - Deleted: {outcome['deletions']} lines\n\n"
        )

    with open(output_file, 'w') as report:
        report.writelines(["Batch Comparison Report\n", "=" * 50 + "\n\n"])
        report.writelines(render(name, outcome)
                          for name, outcome in results.items())

Shell Scripts

Unix/Linux command line tools provide powerful diff functionality for quick comparisons.

Basic Diff Script

#!/bin/bash

# Simple file comparison script
# Prints a unified diff of two files followed by line counts and the
# number of changed lines.
if [ $# -ne 2 ]; then
    echo "Usage: $0 file1 file2"
    exit 1
fi

file1="$1"
file2="$2"

if [ ! -f "$file1" ] || [ ! -f "$file2" ]; then
    echo "Error: One or both files do not exist"
    exit 1
fi

echo "Comparing $file1 and $file2..."
echo "=================================="

# Generate diff with context ('--' keeps names starting with '-' from
# being parsed as options)
diff -u -- "$file1" "$file2"

# Show summary statistics
echo ""
echo "Summary:"
echo "Lines in $file1: $(wc -l < "$file1")"
echo "Lines in $file2: $(wc -l < "$file2")"
# Count changed lines from the default ("normal") diff format, where every
# removed line starts with '<' and every added line with '>'. Grepping the
# unified diff for '^[+-]' would also count the '---'/'+++' file headers.
echo "Differences: $(diff -- "$file1" "$file2" | grep -c '^[<>]')"

Advanced Shell Script with Options

#!/bin/bash

# Advanced diff script with options
# Prints a unified diff with a configurable amount of context, optionally
# ignoring case and/or whitespace, followed by simple statistics.
usage() {
    echo "Usage: $0 [-c context] [-i] [-w] file1 file2"
    echo "  -c: Number of context lines (default: 3)"
    echo "  -i: Ignore case differences"
    echo "  -w: Ignore whitespace differences"
    exit 1
}

CONTEXT=3
IGNORE_CASE=""
IGNORE_WHITESPACE=""

while getopts "c:iw" opt; do
    case $opt in
        c) CONTEXT="$OPTARG" ;;
        i) IGNORE_CASE="-i" ;;
        w) IGNORE_WHITESPACE="-w" ;;
        *) usage ;;
    esac
done

shift $((OPTIND-1))

if [ $# -ne 2 ]; then
    usage
fi

# The context size is handed straight to diff, so insist on a plain
# non-negative integer.
case "$CONTEXT" in
    ''|*[!0-9]*) usage ;;
esac

file1="$1"
file2="$2"

if [ ! -f "$file1" ] || [ ! -f "$file2" ]; then
    echo "Error: One or both files do not exist"
    exit 1
fi

# Collect the comparison flags in an array so that unset options vanish
# cleanly and nothing depends on word splitting.
IGNORE_OPTS=()
[ -n "$IGNORE_CASE" ] && IGNORE_OPTS+=("$IGNORE_CASE")
[ -n "$IGNORE_WHITESPACE" ] && IGNORE_OPTS+=("$IGNORE_WHITESPACE")

# Build diff command. -U N selects unified output with N context lines;
# combining -u with -C N asks for two different output styles at once,
# which diff rejects.
DIFF_CMD=(diff -U "$CONTEXT" "${IGNORE_OPTS[@]}")

echo "Comparing files with $CONTEXT lines of context..."
"${DIFF_CMD[@]}" -- "$file1" "$file2"

# Generate statistics
echo ""
echo "=== Statistics ==="
echo "File 1: $file1 ($(wc -l < "$file1") lines)"
echo "File 2: $file2 ($(wc -l < "$file2") lines)"

# Count differences using the normal diff format ('<' = removed line,
# '>' = added line) so that the '---'/'+++' headers of the unified format
# are not counted as changes.
DIFF_COUNT=$(diff "${IGNORE_OPTS[@]}" -- "$file1" "$file2" | grep -c '^[<>]')
echo "Total differences: $DIFF_COUNT"

Directory Comparison Script

#!/bin/bash

# Compare all files in two directories
#
# Usage: compare_directories dir1 dir2 [pattern]
#   dir1, dir2: directories to compare; files are paired by their path
#               relative to each directory
#   pattern:    file-name pattern passed to find -name (default: *)
compare_directories() {
    local dir1="${1%/}"
    local dir2="${2%/}"
    local pattern="${3:-*}"
    local file1 file2 rel_path

    # Stripping the trailing slash keeps the prefix removal below working
    # for "dir/"; restore "/" if that was the whole argument.
    [ -n "$dir1" ] || dir1="/"
    [ -n "$dir2" ] || dir2="/"

    echo "Comparing directories: $dir1 and $dir2"
    echo "Pattern: $pattern"
    echo "======================================"

    # Walk the first directory with NUL-delimited names so that paths
    # containing spaces or newlines are never split into several words.
    while IFS= read -r -d '' file1; do
        rel_path="${file1#"$dir1"/}"
        file2="$dir2/$rel_path"

        if [ -f "$file2" ]; then
            echo "Comparing: $rel_path"
            if diff -q "$file1" "$file2" > /dev/null; then
                echo "  ✓ Files are identical"
            else
                echo "  ✗ Files differ"
                diff -u "$file1" "$file2" | head -20
                echo "  ... (showing first 20 lines of diff)"
            fi
        else
            echo "  ! File not found in second directory: $rel_path"
        fi
        echo ""
    done < <(find "$dir1" -name "$pattern" -type f -print0 | sort -z)
}

Git Hooks

Automate comparisons as part of your development workflow using Git hooks.

Pre-commit Hook for Code Style

#!/bin/bash

# Pre-commit hook to check for style issues
# Rejects the commit when staged changes introduce whitespace errors or
# when a staged file is larger than MAX_SIZE bytes.
echo "Running pre-commit checks..."

# Check for trailing whitespace
if git diff --cached --check; then
    echo "✓ No trailing whitespace found"
else
    echo "✗ Trailing whitespace detected"
    echo "Please remove trailing whitespace and commit again"
    exit 1
fi

# Check for large files
MAX_SIZE=1048576  # 1MB
# -z yields NUL-delimited names, so paths with spaces or newlines stay
# intact; --diff-filter skips deletions, which have no staged content.
while IFS= read -r -d '' file; do
    # Measure the staged blob (":<path>" names the index entry) rather than
    # the working-tree file: that is what will actually be committed, and it
    # avoids `stat -c`, which is specific to GNU stat. Entries without a
    # readable blob (e.g. submodules) are skipped.
    size=$(git cat-file -s ":$file" 2>/dev/null) || continue
    if [ "$size" -gt "$MAX_SIZE" ]; then
        echo "✗ Large file detected: $file ($size bytes)"
        echo "Consider using Git LFS for large files"
        exit 1
    fi
done < <(git diff --cached --name-only -z --diff-filter=ACMR)

echo "✓ Pre-commit checks passed"
exit 0

Post-commit Hook for Documentation

#!/bin/bash

# Post-commit hook to generate diff documentation
# Writes the diff of the new commit and a short summary file under
# docs/changes/.
COMMIT_HASH=$(git rev-parse HEAD)
# The first commit of a repository has no parent; compare it against the
# empty tree instead so the hook still produces a complete diff.
if ! PREV_COMMIT=$(git rev-parse --verify --quiet HEAD~1); then
    PREV_COMMIT=$(git hash-object -t tree /dev/null)
fi

# Create diff documentation
DIFF_FILE="docs/changes/commit_${COMMIT_HASH:0:8}.diff"
mkdir -p "$(dirname "$DIFF_FILE")"

echo "Generating diff documentation..."
git diff "$PREV_COMMIT" "$COMMIT_HASH" > "$DIFF_FILE"

# Generate summary
SUMMARY_FILE="docs/changes/commit_${COMMIT_HASH:0:8}_summary.txt"
{
    echo "Commit: $COMMIT_HASH"
    echo "Date: $(date)"
    echo "Author: $(git log -1 --format='%an <%ae>')"
    echo "Message: $(git log -1 --format='%s')"
    echo ""
    echo "Files changed:"
    git diff --name-only "$PREV_COMMIT" "$COMMIT_HASH"
    echo ""
    # --shortstat prints the "N files changed, X insertions(+), Y
    # deletions(-)" line (with a leading space), which covers more than
    # added lines, so it is labelled as a change summary.
    echo "Change summary:$(git diff --shortstat "$PREV_COMMIT" "$COMMIT_HASH")"
} > "$SUMMARY_FILE"

echo "✓ Documentation generated: $DIFF_FILE"

CI/CD Integration

Include text comparison in your automated testing and deployment pipeline.

GitHub Actions Workflow

# Runs the repository's comparison scripts on pushes and pull requests
# targeting main, then publishes the resulting diff report.
name: Text Comparison Checks

on:
  pull_request:
    branches: [ main ]
  push:
    branches: [ main ]

jobs:
  compare:
    runs-on: ubuntu-latest
    
    steps:
    - uses: actions/checkout@v4
      with:
        fetch-depth: 2  # Need previous commit for comparison
    
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.9'
    
    - name: Install dependencies
      # difflib is part of the Python standard library and cannot be
      # installed with pip. Add `pip install <package>` lines here only for
      # third-party packages the scripts actually need.
      run: |
        python -m pip install --upgrade pip
    
    - name: Run comparison checks
      run: |
        python scripts/compare_changes.py
    
    - name: Generate diff report
      run: |
        python scripts/generate_report.py
        echo "## Diff Report" >> $GITHUB_STEP_SUMMARY
        cat diff_report.txt >> $GITHUB_STEP_SUMMARY
    
    - name: Upload diff artifacts
      uses: actions/upload-artifact@v4
      with:
        name: diff-reports
        path: |
          diff_report.txt
          *.diff

Jenkins Pipeline

// Declarative pipeline: checks out the sources, writes the changes under
// review to changes.diff, analyses them, and archives the generated reports.
pipeline {
    agent any
    
    stages {
        stage('Checkout') {
            steps {
                checkout scm
            }
        }
        
        stage('Compare Changes') {
            steps {
                script {
                    // Compare with previous version
                    sh '''
                        if [ -n "$CHANGE_ID" ]; then
                            # Pull request
                            git diff origin/main...HEAD > changes.diff
                        else
                            # Direct push
                            git diff HEAD~1...HEAD > changes.diff
                        fi
                        
                        # Analyze changes
                        python scripts/analyze_changes.py changes.diff
                    '''
                }
            }
        }
        
        stage('Generate Report') {
            steps {
                script {
                    sh '''
                        python scripts/generate_report.py
                    '''
                    // archiveArtifacts is a Pipeline step, not a shell
                    // command, so it has to be called outside the sh script.
                    archiveArtifacts artifacts: '*.diff,*.txt', fingerprint: true
                }
            }
        }
    }
    
    post {
        always {
            // Clean up
            sh 'rm -f *.diff *.txt'
        }
    }
}

Advanced Automation Techniques

Fuzzy Matching with Python

from fuzzywuzzy import fuzz
import pandas as pd

def fuzzy_compare_texts(text1, text2, threshold=80):
    """Decide whether two texts are similar enough under fuzzy matching.

    Returns a ``(matched, score)`` tuple: ``score`` is the value returned
    by ``fuzz.ratio(text1, text2)`` and ``matched`` is True when that score
    is at least *threshold*.
    """
    score = fuzz.ratio(text1, text2)
    matched = score >= threshold
    return matched, score

def batch_fuzzy_compare(file1, file2, output_file):
    """Compare lines in two files using fuzzy matching.

    For every line of *file1* the line of *file2* with the highest
    ``fuzz.ratio`` score is recorded (the earliest one on ties). Lines are
    compared with surrounding whitespace stripped.

    Args:
        file1: Path of the file whose lines are matched.
        file2: Path of the file searched for the best match.
        output_file: Path of the CSV file the results are written to.

    Returns:
        A pandas DataFrame with one row per line of *file1* and the columns
        'line1_index', 'line1_content', 'line2_index', 'line2_content' and
        'similarity'. When no line of *file2* scores above 0, 'line2_index'
        is None and 'line2_content' is the empty string.
    """
    with open(file1, 'r') as f1, open(file2, 'r') as f2:
        # Strip each line once up front instead of re-stripping every line
        # of file2 for every line of file1 inside the nested loop.
        texts1 = [line.strip() for line in f1]
        texts2 = [line.strip() for line in f2]

    results = []
    for i, text1 in enumerate(texts1):
        best_match = None
        best_score = 0

        for j, text2 in enumerate(texts2):
            score = fuzz.ratio(text1, text2)
            # Strict comparison keeps the first of several equal matches.
            if score > best_score:
                best_score = score
                best_match = j

        results.append({
            'line1_index': i,
            'line1_content': text1,
            'line2_index': best_match,
            'line2_content': texts2[best_match] if best_match is not None else '',
            'similarity': best_score
        })

    # Save results to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    return df

Real-time Monitoring

import time
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class DiffMonitor(FileSystemEventHandler):
    """Watchdog handler that diffs a file against its backup when it changes.

    Each time the watched file is modified (at most once per second) the
    file is compared with ``<reference_file>.backup`` and the result is
    written to a timestamped text file in ``output_dir``.
    """

    def __init__(self, reference_file, output_dir):
        """Remember the file to watch and where reports should be written.

        Args:
            reference_file: Path of the file to watch; it must exist.
            output_dir: Directory that receives the ``diff_*.txt`` reports.

        Raises:
            OSError: If *reference_file* cannot be stat'ed.
        """
        super().__init__()
        self.reference_file = reference_file
        self.output_dir = output_dir
        self.last_modified = os.path.getmtime(reference_file)
        # Event paths may be spelled differently from the path given here
        # (relative vs. absolute, "./" prefix), so compare absolute forms.
        self._reference_abspath = os.path.abspath(reference_file)

    def on_modified(self, event):
        """Run a comparison when the watched file itself was modified."""
        event_path = os.path.abspath(os.fsdecode(event.src_path))
        if event_path != self._reference_abspath:
            return

        current_time = time.time()
        if current_time - self.last_modified > 1:  # Debounce
            self.compare_files()
            self.last_modified = current_time

    def compare_files(self):
        """Diff the watched file against its backup and save a report."""
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        output_file = os.path.join(self.output_dir, f"diff_{timestamp}.txt")
        backup_file = f"{self.reference_file}.backup"

        # Run comparison and save results. A missing or unreadable file
        # must not raise out of the event callback, so report and move on.
        try:
            result = detailed_compare(self.reference_file, backup_file)
        except OSError as exc:
            print(f"Comparison skipped: {exc}")
            return

        os.makedirs(self.output_dir, exist_ok=True)
        with open(output_file, 'w') as f:
            f.write(f"Comparison at {timestamp}\n")
            f.write(f"Similarity: {result['similarity']:.2%}\n")
            # Put every diff entry on its own line. The header entries have
            # no terminator of their own, so a plain "".join would fuse
            # them together.
            f.write("\n".join(line.rstrip("\n") for line in result['diff']))
            f.write("\n")

        print(f"Comparison saved to {output_file}")

def start_monitoring(file_path, output_dir):
    """Watch *file_path* and write a diff report whenever it changes.

    Blocks until interrupted with Ctrl+C.

    Args:
        file_path: Path of the file to watch.
        output_dir: Directory that receives the generated reports.
    """
    # Work with the absolute path: os.path.dirname() of a bare file name is
    # '', which is not a directory that can be watched, and the handler
    # compares its path against the paths reported in events.
    file_path = os.path.abspath(file_path)

    event_handler = DiffMonitor(file_path, output_dir)
    observer = Observer()
    observer.schedule(event_handler, path=os.path.dirname(file_path), recursive=False)
    observer.start()

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to end monitoring.
        pass
    finally:
        # Always shut the observer thread down, whatever ended the loop.
        observer.stop()
        observer.join()

Best Practices for Automation

Error handling: Always handle file not found and permission errors
Logging: Log all comparison activities for debugging
Performance: Use efficient algorithms for large files
Configuration: Make scripts configurable with command-line options
Documentation: Document your automation scripts
Testing: Test your scripts with various file types and sizes

Integration with Our Diff Tool

You can integrate our Easy Text Diff Tool into your automation workflow by:

Using it for quick manual verification of automated results
Sharing diff results with team members
Documenting changes in a visual format
Comparing outputs from different automation runs

Conclusion

Automating text comparison tasks can significantly improve your workflow efficiency and reduce errors. Start with simple scripts and gradually build more sophisticated automation as your needs grow. Remember to use our Easy Text Diff Tool for manual verification and sharing results with your team.