Brad's PyNotes

Glob Module: Unix-Style Pathname Pattern Matching

TL;DR

The glob module finds all pathnames matching a Unix shell-style pattern using wildcards like * (any characters), ? (single character), and [seq] (character ranges), making file discovery and batch operations simple.

Interesting!

The glob module gets its name from “global command” - the original Unix shell feature that expanded wildcards. Python’s implementation goes beyond basic shell globbing: it supports recursive matching with ** (via recursive=True) and, in recent versions, options such as root_dir (3.10+) and include_hidden (3.11+).
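
A quick taste of the difference, as a sketch (the recursive flag that enables ** has been available since Python 3.5):

python code snippet start

import glob

# Shell-style: exactly one directory level down
print(glob.glob('*/*.py'))

# Python extension: any number of levels, including the top level
print(glob.glob('**/*.py', recursive=True))

python code snippet end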

Basic Glob Patterns

python code snippet start

import glob
import os

# Create some test files for demonstration
test_files = [
    'data.txt', 'data.csv', 'backup.txt', 'image.png',
    'document.pdf', 'script.py', 'config.json'
]
for name in test_files:
    with open(name, 'w') as f:
        f.write('sample\n')

# Basic wildcard patterns
print("=== Basic Patterns ===")

# * matches any number of characters
txt_files = glob.glob('*.txt')
print(f"Text files: {txt_files}")

# ? matches exactly one character
single_char = glob.glob('data.???')
print(f"data.??? pattern: {single_char}")

# Character ranges and sets
brackets = glob.glob('[cd]*')  # Files starting with 'c' or 'd'
print(f"Files starting with c or d: {brackets}")

# Negation with ! inside brackets
not_txt = glob.glob('*.[!t]*')  # extension does not start with 't'
print(f"Not .t* files: {not_txt}")

# Multiple patterns
patterns = ['*.py', '*.txt', '*.json']
all_matches = []
for pattern in patterns:
    all_matches.extend(glob.glob(pattern))
print(f"Multiple patterns: {all_matches}")

python code snippet end
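
Two defaults are easy to miss: match order is arbitrary, and hidden files are skipped. A short illustration (this assumes a .gitignore file exists in the working directory):

python code snippet start

import glob

# 1) Results come back in arbitrary filesystem order - sort when order matters
txt_files = sorted(glob.glob('*.txt'))
print(f"Sorted text files: {txt_files}")

# 2) Hidden files (leading dot) are only matched by patterns that also
#    start with a dot (assumes a .gitignore exists here)
print(glob.glob('*'))    # .gitignore absent from this list
print(glob.glob('.*'))   # .gitignore present

python code snippet end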

Advanced Pattern Matching

python code snippet start

import glob
import os

# Create directory structure for examples
def create_test_structure():
    """Create test directory structure."""
    dirs = [
        'project/src/main',
        'project/src/utils', 
        'project/tests/unit',
        'project/tests/integration',
        'project/docs/api',
        'project/data/raw',
        'project/data/processed'
    ]
    
    files = [
        'project/README.md',
        'project/setup.py',
        'project/src/main/app.py',
        'project/src/main/models.py',
        'project/src/utils/helpers.py',
        'project/src/utils/config.py',
        'project/tests/unit/test_models.py',
        'project/tests/unit/test_helpers.py',
        'project/tests/integration/test_app.py',
        'project/docs/api/reference.md',
        'project/data/raw/dataset.csv',
        'project/data/processed/clean_data.csv'
    ]
    
    # Actually create them so the examples below have something to match
    for d in dirs:
        os.makedirs(d, exist_ok=True)
    for f in files:
        with open(f, 'w') as fh:
            fh.write('placeholder\n')
    return dirs, files

create_test_structure()

# Character classes and ranges
print("=== Character Classes ===")

# Range patterns
alpha_files = glob.glob('[a-z]*')  # Files starting with lowercase
print(f"Lowercase start: {alpha_files}")

numeric_files = glob.glob('*[0-9]*')  # Files containing digits
print(f"Contains digits: {numeric_files}")

# Complex character sets
complex_pattern = glob.glob('[!._]*[a-zA-Z]')  # Not starting with . or _, ending with letter
print(f"Complex pattern: {complex_pattern}")

# Case sensitivity (platform dependent)
case_files = glob.glob('[Dd]ata*')  # Files starting with 'Data' or 'data'
print(f"Case variants: {case_files}")

python code snippet end
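
Under the hood, glob's matching rules come from the fnmatch module, which you can apply directly to any list of strings - handy for filtering names that never touch the filesystem:

python code snippet start

import fnmatch

# Same *, ?, [seq] syntax, applied to plain strings
names = ['data.txt', 'Data.csv', 'notes.md']
print(fnmatch.filter(names, '[Dd]ata.*'))  # ['data.txt', 'Data.csv']

python code snippet end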

Recursive Globbing

python code snippet start

import glob
import os

# Recursive pattern matching with **
print("=== Recursive Patterns ===")

# Find all Python files recursively
all_python = glob.glob('**/*.py', recursive=True)
print(f"All Python files: {all_python}")

# Find all files in subdirectories
all_files = glob.glob('**/*', recursive=True)
print(f"Total files found: {len(all_files)}")

# Specific directory patterns
test_files = glob.glob('**/tests/**/*.py', recursive=True)
print(f"Test files: {test_files}")

# Find configuration files anywhere
config_files = glob.glob('**/config.*', recursive=True)
print(f"Config files: {config_files}")

# Limit recursion depth (manual approach)
def glob_with_depth(pattern, max_depth=2):
    """Glob with a maximum recursion depth."""
    results = []

    # One pattern per depth level: pattern, */pattern, */*/pattern, ...
    for depth in range(max_depth + 1):
        depth_pattern = '/'.join(['*'] * depth + [pattern])
        results.extend(glob.glob(depth_pattern))

    return sorted(set(results))  # Remove duplicates, deterministic order

limited_search = glob_with_depth('*.py', max_depth=2)
print(f"Limited depth search: {limited_search}")

python code snippet end
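
pathlib exposes the same recursive matching through Path.rglob; the two are roughly interchangeable (minor differences in hidden-file handling aside):

python code snippet start

from pathlib import Path

# Roughly equivalent to glob.glob('**/*.py', recursive=True)
py_files = [str(p) for p in Path('.').rglob('*.py')]
print(f"Python files via pathlib: {len(py_files)}")

python code snippet end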

Advanced Glob Functions

python code snippet start

import glob
import os
from pathlib import Path

# glob.iglob() - iterator version for large results
print("=== Iterator Globbing ===")

def process_large_directory(limit=100):
    """Process large directories efficiently with iglob."""
    # The iterator yields one path at a time instead of building a full list
    for count, filepath in enumerate(glob.iglob('**/*', recursive=True)):
        if count >= limit:  # stop early for the demo
            break
        if os.path.isfile(filepath):
            size = os.path.getsize(filepath)
            if size > 1024:  # files larger than 1 KB
                print(f"Large file: {filepath} ({size} bytes)")

# glob.escape() - escape special characters
print("\n=== Escaping Special Characters ===")

def safe_glob(filename_with_specials):
    """Safely glob filenames containing special characters."""
    # If you have files with [], *, ? in their names
    escaped = glob.escape(filename_with_specials)
    return glob.glob(escaped)

# Example: file named "data[2024].txt"
special_filename = "data[2024].txt"
escaped_pattern = glob.escape(special_filename)
print(f"Original: {special_filename}")
print(f"Escaped: {escaped_pattern}")

# glob with different working directory
def glob_in_directory(directory, pattern):
    """Glob in a specific directory."""
    old_cwd = os.getcwd()
    try:
        os.chdir(directory)
        results = glob.glob(pattern)
        # Convert to absolute paths
        return [os.path.abspath(f) for f in results]
    finally:
        os.chdir(old_cwd)
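
# Python 3.10+ alternative: glob accepts root_dir directly, which avoids
# the chdir dance above entirely:
#   glob.glob(pattern, root_dir=directory)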

# Alternative: use pathlib for cleaner code
def pathlib_glob(directory, pattern):
    """Use pathlib for directory-specific globbing."""
    path = Path(directory)
    return list(path.glob(pattern))

# Example usage
if Path('project').exists():
    project_files = pathlib_glob('project', '**/*.py')
    print(f"Project Python files: {project_files}")

python code snippet end

File Type and Attribute Filtering

python code snippet start

import glob
import os
import stat
from datetime import datetime, timedelta

# Filter by file attributes
print("=== File Attribute Filtering ===")

def filter_by_size(pattern, min_size=0, max_size=float('inf')):
    """Filter glob results by file size."""
    results = []
    for filepath in glob.glob(pattern, recursive=True):  # recursive so '**' patterns work
        if os.path.isfile(filepath):
            size = os.path.getsize(filepath)
            if min_size <= size <= max_size:
                results.append((filepath, size))
    return results

def filter_by_modification_time(pattern, days_old=7):
    """Filter files modified within specified days."""
    cutoff = datetime.now() - timedelta(days=days_old)
    results = []
    
    for filepath in glob.glob(pattern, recursive=True):
        if os.path.isfile(filepath):
            mtime = datetime.fromtimestamp(os.path.getmtime(filepath))
            if mtime >= cutoff:
                results.append((filepath, mtime))
    
    return results

def filter_by_permissions(pattern, permission_check):
    """Filter files by permissions."""
    results = []
    for filepath in glob.glob(pattern, recursive=True):
        if os.path.exists(filepath):
            mode = os.stat(filepath).st_mode
            if permission_check(mode):
                results.append(filepath)
    return results

# Usage examples
large_files = filter_by_size('**/*', min_size=1024, max_size=1024*1024)
print(f"Files 1KB-1MB: {len(large_files)}")

recent_files = filter_by_modification_time('**/*.py', days_old=30)
print(f"Python files modified in last 30 days: {len(recent_files)}")

# Check for executable files
executable_files = filter_by_permissions(
    '**/*', 
    lambda mode: bool(mode & stat.S_IXUSR)
)
print(f"Executable files: {len(executable_files)}")

# Readable files
readable_files = filter_by_permissions(
    '**/*',
    lambda mode: bool(mode & stat.S_IRUSR)
)
print(f"Readable files: {len(readable_files)}")

python code snippet end
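
These filters build the full match list in memory before filtering; for very large trees the same idea works lazily as a generator over iglob. A minimal sketch:

python code snippet start

import glob
import os

def iter_files_by_size(pattern, min_size=0, max_size=float('inf')):
    """Lazily yield (path, size) for matches within the size bounds."""
    for filepath in glob.iglob(pattern, recursive=True):
        if os.path.isfile(filepath):
            size = os.path.getsize(filepath)
            if min_size <= size <= max_size:
                yield filepath, size

for path, size in iter_files_by_size('**/*.log', min_size=10_000):
    print(f"{path}: {size} bytes")

python code snippet end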

Advanced Pattern Examples

python code snippet start

import glob
import re
from collections import defaultdict

# Complex real-world patterns
print("=== Real-World Patterns ===")

# Log file analysis
def find_log_files(date_pattern=None):
    """Find log files with optional date filtering."""
    patterns = [
        '**/*.log',
        '**/*log*',
        '**/logs/**/*',
        '**/*.log.*'  # Rotated logs
    ]
    
    all_logs = []
    for pattern in patterns:
        all_logs.extend(glob.glob(pattern, recursive=True))
    
    # Remove duplicates
    unique_logs = list(set(all_logs))
    
    if date_pattern:
        # Filter by date pattern in filename
        date_filtered = [
            log for log in unique_logs 
            if re.search(date_pattern, log)
        ]
        return date_filtered
    
    return unique_logs

# Find backup files
def find_backup_files():
    """Find various backup file patterns."""
    backup_patterns = [
        '**/*.bak',
        '**/*.backup',
        '**/*~',           # Emacs backups
        '**/*.orig',       # Original files
        '**/#*#',          # Emacs auto-save
        '**/.#*',          # More Emacs files
        '**/Backup of *',  # Windows backup pattern
        '**/*_backup_*',   # Custom backup pattern
        '**/*.old'
    ]
    
    backups = []
    for pattern in backup_patterns:
        backups.extend(glob.glob(pattern, recursive=True))
    
    return list(set(backups))

# Find media files by type
def categorize_media_files():
    """Categorize media files by type."""
    media_patterns = {
        'images': ['**/*.jpg', '**/*.jpeg', '**/*.png', '**/*.gif', '**/*.bmp', '**/*.svg'],
        'videos': ['**/*.mp4', '**/*.avi', '**/*.mov', '**/*.mkv', '**/*.wmv', '**/*.flv'],
        'audio': ['**/*.mp3', '**/*.wav', '**/*.flac', '**/*.ogg', '**/*.m4a', '**/*.aac'],
        'documents': ['**/*.pdf', '**/*.doc', '**/*.docx', '**/*.txt', '**/*.rtf']
    }
    
    categorized = defaultdict(list)
    
    for category, patterns in media_patterns.items():
        for pattern in patterns:
            files = glob.glob(pattern, recursive=True)
            categorized[category].extend(files)
    
    return dict(categorized)

# Find source code files
def find_source_files():
    """Find source code files by language."""
    languages = {
        'python': ['**/*.py', '**/*.pyw', '**/*.pyx'],
        'javascript': ['**/*.js', '**/*.jsx', '**/*.ts', '**/*.tsx'],
        'web': ['**/*.html', '**/*.htm', '**/*.css', '**/*.scss', '**/*.sass'],
        'c_cpp': ['**/*.c', '**/*.cpp', '**/*.h', '**/*.hpp', '**/*.cc'],
        'java': ['**/*.java', '**/*.class', '**/*.jar'],
        'config': ['**/*.json', '**/*.yaml', '**/*.yml', '**/*.xml', '**/*.ini', '**/*.cfg']
    }
    
    source_files = {}
    for lang, patterns in languages.items():
        files = []
        for pattern in patterns:
            files.extend(glob.glob(pattern, recursive=True))
        source_files[lang] = list(set(files))
    
    return source_files

# Usage examples
logs = find_log_files(r'\d{4}-\d{2}-\d{2}')  # Logs with date pattern
print(f"Date-stamped logs: {len(logs)}")

backups = find_backup_files()
print(f"Backup files found: {len(backups)}")

media = categorize_media_files()
for category, files in media.items():
    print(f"{category.title()}: {len(files)} files")

source = find_source_files()
for lang, files in source.items():
    print(f"{lang.title()}: {len(files)} files")

python code snippet end
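
One gap in extension-based patterns: on case-sensitive filesystems, '**/*.jpg' misses PHOTO.JPG. A broad glob followed by a lowercased suffix check closes it; a sketch:

python code snippet start

import glob
import os

IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif'}

# Case-insensitive extension matching on any platform
images = [
    path for path in glob.glob('**/*', recursive=True)
    if os.path.splitext(path)[1].lower() in IMAGE_EXTS
]
print(f"Images (any case): {len(images)}")

python code snippet end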

Practical File Operations with Glob

python code snippet start

import glob
import os
import shutil
import hashlib
from collections import defaultdict

# Batch file operations
print("=== Batch Operations ===")

def batch_rename(pattern, prefix="", suffix=""):
    """Batch rename files matching pattern."""
    files = glob.glob(pattern)
    renamed = []
    
    for filepath in files:
        directory = os.path.dirname(filepath)
        filename = os.path.basename(filepath)
        name, ext = os.path.splitext(filename)
        
        new_name = f"{prefix}{name}{suffix}{ext}"
        new_path = os.path.join(directory, new_name)
        
        try:
            os.rename(filepath, new_path)
            renamed.append((filepath, new_path))
        except OSError as e:
            print(f"Error renaming {filepath}: {e}")
    
    return renamed

def batch_copy(pattern, destination):
    """Copy all files matching pattern to destination."""
    files = glob.glob(pattern, recursive=True)
    os.makedirs(destination, exist_ok=True)
    
    copied = []
    for filepath in files:
        if os.path.isfile(filepath):
            filename = os.path.basename(filepath)
            dest_path = os.path.join(destination, filename)
            
            try:
                shutil.copy2(filepath, dest_path)
                copied.append((filepath, dest_path))
            except OSError as e:
                print(f"Error copying {filepath}: {e}")
    
    return copied

def find_duplicates(pattern):
    """Find duplicate files by content hash."""
    files = glob.glob(pattern, recursive=True)
    hash_map = defaultdict(list)
    
    for filepath in files:
        if os.path.isfile(filepath):
            try:
                with open(filepath, 'rb') as f:
                    file_hash = hashlib.md5(f.read()).hexdigest()
                    hash_map[file_hash].append(filepath)
            except OSError:
                continue
    
    # Return only groups with duplicates
    duplicates = {k: v for k, v in hash_map.items() if len(v) > 1}
    return duplicates

def cleanup_temp_files():
    """Clean up temporary files safely."""
    temp_patterns = [
        '**/*.tmp',
        '**/*.temp',
        '**/.*~',
        '**/#*#',
        '**/*.pyc',
        '**/__pycache__/**/*',
        '**/node_modules/**/*',
        '**/.DS_Store'
    ]
    
    cleaned = []
    for pattern in temp_patterns:
        files = glob.glob(pattern, recursive=True)
        for filepath in files:
            try:
                if os.path.isfile(filepath):
                    os.remove(filepath)
                    cleaned.append(filepath)
                elif os.path.isdir(filepath) and not os.listdir(filepath):
                    os.rmdir(filepath)  # Remove empty directories
                    cleaned.append(filepath)
            except OSError as e:
                print(f"Could not remove {filepath}: {e}")
    
    return cleaned

# File organization
def organize_files_by_type(source_pattern, base_dir="organized"):
    """Organize files by type into subdirectories."""
    file_types = {
        'documents': ['.pdf', '.doc', '.docx', '.txt', '.rtf'],
        'images': ['.jpg', '.jpeg', '.png', '.gif', '.bmp'],
        'videos': ['.mp4', '.avi', '.mov', '.mkv'],
        'audio': ['.mp3', '.wav', '.flac', '.ogg'],
        'archives': ['.zip', '.rar', '.tar', '.gz', '.7z'],
        'code': ['.py', '.js', '.html', '.css', '.cpp', '.java']
    }
    
    files = glob.glob(source_pattern)
    organized = defaultdict(list)
    
    for filepath in files:
        if not os.path.isfile(filepath):
            continue
            
        _, ext = os.path.splitext(filepath.lower())
        category = 'other'  # Default category
        
        # Find matching category
        for cat, extensions in file_types.items():
            if ext in extensions:
                category = cat
                break
        
        # Create category directory
        category_dir = os.path.join(base_dir, category)
        os.makedirs(category_dir, exist_ok=True)
        
        # Move file
        filename = os.path.basename(filepath)
        dest_path = os.path.join(category_dir, filename)
        
        try:
            shutil.move(filepath, dest_path)
            organized[category].append((filepath, dest_path))
        except OSError as e:
            print(f"Error moving {filepath}: {e}")
    
    return dict(organized)

# Example usage (be careful with file operations!)
print("Example file operations (disabled for safety):")
print("# batch_rename('*.txt', prefix='backup_')")
print("# batch_copy('**/*.py', 'python_files')")
print("# duplicates = find_duplicates('**/*')")
print("# cleanup_temp_files()")
print("# organize_files_by_type('Downloads/*')")

python code snippet end
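
Given how destructive these operations can be, a dry-run flag is a worthwhile habit. A minimal sketch (batch_delete is a hypothetical helper, not one of the functions above):

python code snippet start

import glob
import os

def batch_delete(pattern, dry_run=True):
    """Delete files matching pattern; only report targets when dry_run is True."""
    for filepath in glob.glob(pattern, recursive=True):
        if not os.path.isfile(filepath):
            continue
        if dry_run:
            print(f"Would delete: {filepath}")
        else:
            os.remove(filepath)

batch_delete('**/*.tmp')                   # preview only
# batch_delete('**/*.tmp', dry_run=False)  # actually delete

python code snippet end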

Performance and Best Practices

python code snippet start

import glob
import os
import time
from pathlib import Path
import fnmatch

# Performance comparison
print("=== Performance Tips ===")

def benchmark_glob_methods():
    """Compare different globbing approaches."""
    
    # Method 1: Basic glob
    start = time.time()
    result1 = glob.glob('**/*.py', recursive=True)
    time1 = time.time() - start
    
    # Method 2: pathlib
    start = time.time()
    result2 = list(Path('.').rglob('*.py'))
    time2 = time.time() - start
    
    # Method 3: os.walk with fnmatch
    start = time.time()
    result3 = []
    for root, dirs, files in os.walk('.'):
        for file in files:
            if fnmatch.fnmatch(file, '*.py'):
                result3.append(os.path.join(root, file))
    time3 = time.time() - start
    
    print(f"glob.glob: {time1:.4f}s ({len(result1)} files)")
    print(f"pathlib: {time2:.4f}s ({len(result2)} files)")
    print(f"os.walk: {time3:.4f}s ({len(result3)} files)")
    
    return result1, result2, result3

# Memory-efficient processing
def process_large_directory_efficiently():
    """Process large directories without loading all filenames."""
    
    # Use iglob for memory efficiency
    total_size = 0
    file_count = 0
    
    for filepath in glob.iglob('**/*', recursive=True):
        if os.path.isfile(filepath):
            total_size += os.path.getsize(filepath)
            file_count += 1
            
            # Process in chunks to avoid memory issues
            if file_count % 1000 == 0:
                print(f"Processed {file_count} files, total size: {total_size} bytes")
    
    return file_count, total_size

# Best practices
class GlobUtils:
    """Utility class with glob best practices."""
    
    @staticmethod
    def safe_glob(pattern, max_results=10000):
        """Safely glob with result limit."""
        results = []
        count = 0
        
        for filepath in glob.iglob(pattern, recursive=True):
            results.append(filepath)
            count += 1
            
            if count >= max_results:
                print(f"Warning: Limited to {max_results} results")
                break
        
        return results
    
    @staticmethod
    def glob_with_exclude(pattern, exclude_patterns=None):
        """Glob with exclusion patterns."""
        if exclude_patterns is None:
            exclude_patterns = []
        
        results = glob.glob(pattern, recursive=True)
        
        # Filter out excluded patterns
        filtered = []
        for filepath in results:
            excluded = False
            for exclude in exclude_patterns:
                if fnmatch.fnmatch(filepath, exclude):
                    excluded = True
                    break
            
            if not excluded:
                filtered.append(filepath)
        
        return filtered
    
    @staticmethod
    def glob_case_insensitive(pattern):
        """Case-insensitive globbing (manual implementation)."""
        # Rewrite each letter as a two-character class: 'a' -> '[aA]'
        case_pattern = ""
        for char in pattern:
            if char.isalpha():
                case_pattern += f"[{char.lower()}{char.upper()}]"
            else:
                case_pattern += char

        return glob.glob(case_pattern, recursive=True)
    
    @staticmethod
    def validate_pattern(pattern):
        """Check that a pattern can be evaluated (glob rarely raises;
        most malformed patterns simply match nothing)."""
        try:
            glob.glob(pattern)
            return True, "Pattern is valid"
        except Exception as e:
            return False, str(e)

# Usage examples
print("\nBest Practices Examples:")

# Safe globbing with limits
limited_results = GlobUtils.safe_glob('**/*', max_results=100)
print(f"Limited glob: {len(limited_results)} results")

# Exclude patterns
python_no_cache = GlobUtils.glob_with_exclude(
    '**/*.py',
    exclude_patterns=['**/__pycache__/**', '**/*.pyc']
)
print(f"Python files (no cache): {len(python_no_cache)}")

# Pattern validation
valid, message = GlobUtils.validate_pattern('**/*.py')
print(f"Pattern validation: {valid} - {message}")

# Case insensitive (where needed)
case_insensitive = GlobUtils.glob_case_insensitive('*.TXT')
print(f"Case insensitive .txt: {len(case_insensitive)}")

python code snippet end
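
One structural limitation: glob cannot prune whole directory trees during its walk, so excluded paths are still traversed and only filtered out afterwards. os.walk lets you prune in place, which is far cheaper when a tree contains huge ignored directories. A sketch:

python code snippet start

import os
import fnmatch

SKIP_DIRS = {'__pycache__', 'node_modules', '.git'}

def walk_python_files(root='.'):
    """Yield .py paths, pruning ignored directories before descending."""
    for dirpath, dirnames, filenames in os.walk(root):
        # Mutating dirnames in place stops os.walk from descending into them
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
        for name in fnmatch.filter(filenames, '*.py'):
            yield os.path.join(dirpath, name)

print(f"Python files (pruned walk): {sum(1 for _ in walk_python_files())}")

python code snippet end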

Integration with Other Modules

python code snippet start

import glob
import json
import csv
import os
import concurrent.futures

# Integration examples
print("=== Module Integration ===")

def process_json_files(pattern='**/*.json'):
    """Process all JSON files matching pattern."""
    json_data = {}
    
    for json_file in glob.glob(pattern, recursive=True):
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
                json_data[json_file] = data
        except (json.JSONDecodeError, IOError) as e:
            print(f"Error processing {json_file}: {e}")
    
    return json_data

def merge_csv_files(pattern='**/*.csv', output_file='merged.csv'):
    """Merge multiple CSV files into one."""
    csv_files = glob.glob(pattern, recursive=True)
    # Don't re-read a merged file left over from a previous run
    csv_files = [f for f in csv_files
                 if os.path.basename(f) != os.path.basename(output_file)]

    if not csv_files:
        return False
    
    # Read first file to get headers
    with open(csv_files[0], 'r') as f:
        reader = csv.reader(f)
        headers = next(reader)
    
    # Write merged file
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(headers)
        
        for csv_file in csv_files:
            with open(csv_file, 'r') as infile:
                reader = csv.reader(infile)
                next(reader)  # Skip header
                writer.writerows(reader)
    
    return True

# Parallel processing with glob
def process_files_parallel(pattern, processor_func, max_workers=4):
    """Process files in parallel."""
    files = glob.glob(pattern, recursive=True)
    
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(processor_func, file): file 
            for file in files
        }
        
        for future in concurrent.futures.as_completed(future_to_file):
            file = future_to_file[future]
            try:
                result = future.result()
                results.append((file, result))
            except Exception as e:
                print(f"Error processing {file}: {e}")
    
    return results

# Example processor function
def analyze_file(filepath):
    """Analyze a single file."""
    try:
        stat_info = os.stat(filepath)
        return {
            'size': stat_info.st_size,
            'modified': stat_info.st_mtime,
            'type': 'file' if os.path.isfile(filepath) else 'directory'
        }
    except OSError:
        return None

# Usage examples
print("Integration examples:")

# Process JSON files
json_data = process_json_files('**/*.json')
print(f"JSON files processed: {len(json_data)}")

# Parallel file analysis
file_analysis = process_files_parallel('**/*.py', analyze_file, max_workers=2)
print(f"Files analyzed in parallel: {len(file_analysis)}")

# File monitoring with glob
class FileWatcher:
    """Simple file watcher using glob."""
    
    def __init__(self, pattern):
        self.pattern = pattern
        self.last_files = set(glob.glob(pattern, recursive=True))
    
    def check_changes(self):
        """Check for new or removed files."""
        current_files = set(glob.glob(self.pattern, recursive=True))
        
        new_files = current_files - self.last_files
        removed_files = self.last_files - current_files
        
        self.last_files = current_files
        
        return {
            'new': list(new_files),
            'removed': list(removed_files),
            'total': len(current_files)
        }

# Example watcher
watcher = FileWatcher('**/*.py')
changes = watcher.check_changes()
print(f"File changes: {changes['total']} total files")

python code snippet end

The glob module provides powerful, flexible file pattern matching that integrates seamlessly with Python’s file operations, making it essential for file management, data processing, and automation tasks. Pair it with pathlib for modern path handling and os for file operations, and combine it with the json and csv modules for data pipeline automation.

Reference: Python glob module documentation - https://docs.python.org/3/library/glob.html