Glob Module: Unix-Style Pathname Pattern Matching
TL;DR
The glob module finds all pathnames matching a Unix shell-style pattern using wildcards like * (any run of characters), ? (exactly one character), and [seq] (a character set or range), making file discovery and batch operations simple.
Interesting!
The glob module gets its name from “global command” - the original Unix shell feature that expanded wildcards. Python’s implementation is more powerful than basic shell globbing, supporting recursive patterns with ** (when recursive=True is passed) plus helpers such as iglob() for lazy iteration and escape() for matching literal special characters.
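As a quick illustration (a minimal sketch, run from any directory containing Python files): ** only crosses directory boundaries when recursive=True is passed, and hidden dotfiles are skipped unless you opt in.
python code snippet start
import glob

# Without recursive=True, ** behaves like a single *
print(glob.glob('**/*.py'))                  # one directory level only
print(glob.glob('**/*.py', recursive=True))  # walks the whole tree

# Assumption: Python 3.11+ is available for the include_hidden flag,
# which also matches names starting with a dot.
# print(glob.glob('**/*.py', recursive=True, include_hidden=True))
python code snippet end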
Basic Glob Patterns
python code snippet start
import glob
import os
# Example filenames referenced by the patterns below (in a real run,
# create them first, e.g. with open(name, 'w'))
test_files = [
    'data.txt', 'data.csv', 'backup.txt', 'image.png',
    'document.pdf', 'script.py', 'config.json'
]
# Basic wildcard patterns
print("=== Basic Patterns ===")
# * matches any number of characters
txt_files = glob.glob('*.txt')
print(f"Text files: {txt_files}")
# ? matches exactly one character
single_char = glob.glob('data.???')
print(f"data.??? pattern: {single_char}")
# Character ranges and sets
brackets = glob.glob('[cd]*') # Files starting with 'c' or 'd'
print(f"Files starting with c or d: {brackets}")
# Negation with ! inside brackets
not_txt = glob.glob('*.[!t]*')  # Extension's first character is not 't'
print(f"Extension not starting with 't': {not_txt}")
# Multiple patterns
patterns = ['*.py', '*.txt', '*.json']
all_matches = []
for pattern in patterns:
    all_matches.extend(glob.glob(pattern))
print(f"Multiple patterns: {all_matches}")
python code snippet end
Advanced Pattern Matching
python code snippet start
import glob
from pathlib import Path
# Create directory structure for examples
def create_test_structure():
    """Define a test directory structure (paths only, for illustration)."""
    dirs = [
        'project/src/main',
        'project/src/utils',
        'project/tests/unit',
        'project/tests/integration',
        'project/docs/api',
        'project/data/raw',
        'project/data/processed'
    ]
    files = [
        'project/README.md',
        'project/setup.py',
        'project/src/main/app.py',
        'project/src/main/models.py',
        'project/src/utils/helpers.py',
        'project/src/utils/config.py',
        'project/tests/unit/test_models.py',
        'project/tests/unit/test_helpers.py',
        'project/tests/integration/test_app.py',
        'project/docs/api/reference.md',
        'project/data/raw/dataset.csv',
        'project/data/processed/clean_data.csv'
    ]
    # In practice, you'd create these with os.makedirs() and open()
    return dirs, files
# Character classes and ranges
print("=== Character Classes ===")
# Range patterns
alpha_files = glob.glob('[a-z]*') # Files starting with lowercase
print(f"Lowercase start: {alpha_files}")
numeric_files = glob.glob('*[0-9]*') # Files containing digits
print(f"Contains digits: {numeric_files}")
# Complex character sets
complex_pattern = glob.glob('[!._]*[a-zA-Z]') # Not starting with . or _, ending with letter
print(f"Complex pattern: {complex_pattern}")
# Case sensitivity (platform dependent)
case_files = glob.glob('[Dd]ata*') # Files starting with 'Data' or 'data'
print(f"Case variants: {case_files}")
python code snippet end
Recursive Globbing
python code snippet start
import glob
import os
# Recursive pattern matching with **
print("=== Recursive Patterns ===")
# Find all Python files recursively
all_python = glob.glob('**/*.py', recursive=True)
print(f"All Python files: {all_python}")
# Find all files in subdirectories
all_files = glob.glob('**/*', recursive=True)
print(f"Total files found: {len(all_files)}")
# Specific directory patterns
test_files = glob.glob('**/tests/**/*.py', recursive=True)
print(f"Test files: {test_files}")
# Find configuration files anywhere
config_files = glob.glob('**/config.*', recursive=True)
print(f"Config files: {config_files}")
# Limit recursion depth (manual approach)
def glob_with_depth(pattern, max_depth=2):
    """Glob with a maximum recursion depth."""
    results = []
    # Build one pattern per depth level: pattern, */pattern, */*/pattern, ...
    for depth in range(max_depth + 1):
        if depth == 0:
            depth_pattern = pattern
        else:
            depth_pattern = '/'.join(['*'] * depth) + '/' + pattern
        matches = glob.glob(depth_pattern)
        results.extend(matches)
    return list(set(results))  # Remove duplicates
limited_search = glob_with_depth('*.py', max_depth=2)
print(f"Limited depth search: {limited_search}")
python code snippet end
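An alternative sketch for limiting depth: let iglob walk the whole tree and keep only paths with few enough separators (this assumes a relative pattern, so results come back as relative paths).
python code snippet start
import glob
import os

def glob_max_depth(pattern, max_depth=2):
    """Recursive glob, keeping only results at most max_depth directories deep."""
    matches = []
    for path in glob.iglob('**/' + pattern, recursive=True):
        # A relative result contains one os.sep per directory level
        if path.count(os.sep) <= max_depth:
            matches.append(path)
    return matches

print(f"Depth-limited (alternative): {glob_max_depth('*.py', max_depth=2)}")
python code snippet end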
Advanced Glob Functions
python code snippet start
import glob
import os
from pathlib import Path
# glob.iglob() - iterator version for large results
print("=== Iterator Globbing ===")
def process_large_directory():
    """Process large directories efficiently with iglob."""
    # The iterator doesn't load all results into memory at once
    for filepath in glob.iglob('**/*', recursive=True):
        if os.path.isfile(filepath):
            size = os.path.getsize(filepath)
            if size > 1024:  # Files larger than 1KB
                print(f"Large file: {filepath} ({size} bytes)")
        # Process one file at a time - memory efficient
        # break after a few for demo
        if filepath.endswith('.py'):
            break
# glob.escape() - escape special characters
print("\n=== Escaping Special Characters ===")
def safe_glob(filename_with_specials):
    """Safely glob filenames containing special characters."""
    # If you have files with [], *, ? in their names
    escaped = glob.escape(filename_with_specials)
    return glob.glob(escaped)
# Example: file named "data[2024].txt"
special_filename = "data[2024].txt"
escaped_pattern = glob.escape(special_filename)
print(f"Original: {special_filename}")
print(f"Escaped: {escaped_pattern}")
# glob with different working directory
def glob_in_directory(directory, pattern):
    """Glob in a specific directory by temporarily changing cwd."""
    old_cwd = os.getcwd()
    try:
        os.chdir(directory)
        results = glob.glob(pattern)
        # Convert to absolute paths
        return [os.path.abspath(f) for f in results]
    finally:
        os.chdir(old_cwd)

# Alternative: use pathlib for cleaner code
def pathlib_glob(directory, pattern):
    """Use pathlib for directory-specific globbing."""
    path = Path(directory)
    return list(path.glob(pattern))

# Example usage
if Path('project').exists():
    project_files = pathlib_glob('project', '**/*.py')
    print(f"Project Python files: {project_files}")
python code snippet end
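On Python 3.10 and later, changing the working directory is unnecessary: glob.glob() and glob.iglob() accept a root_dir argument, and results come back relative to it. A small sketch, assuming the project/ directory from earlier exists:
python code snippet start
import glob
import os

# Patterns are resolved relative to root_dir (Python 3.10+)
relative_hits = glob.glob('**/*.py', recursive=True, root_dir='project')
absolute_hits = [os.path.join(os.path.abspath('project'), p) for p in relative_hits]
print(f"Via root_dir: {relative_hits}")
print(f"As absolute paths: {absolute_hits}")
python code snippet end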
File Type and Attribute Filtering
python code snippet start
import glob
import os
import stat
from datetime import datetime, timedelta
# Filter by file attributes
print("=== File Attribute Filtering ===")
def filter_by_size(pattern, min_size=0, max_size=float('inf')):
    """Filter glob results by file size."""
    results = []
    for filepath in glob.glob(pattern, recursive=True):
        if os.path.isfile(filepath):
            size = os.path.getsize(filepath)
            if min_size <= size <= max_size:
                results.append((filepath, size))
    return results

def filter_by_modification_time(pattern, days_old=7):
    """Filter files modified within the specified number of days."""
    cutoff = datetime.now() - timedelta(days=days_old)
    results = []
    for filepath in glob.glob(pattern, recursive=True):
        if os.path.isfile(filepath):
            mtime = datetime.fromtimestamp(os.path.getmtime(filepath))
            if mtime >= cutoff:
                results.append((filepath, mtime))
    return results

def filter_by_permissions(pattern, permission_check):
    """Filter files by a permission predicate on st_mode."""
    results = []
    for filepath in glob.glob(pattern, recursive=True):
        if os.path.exists(filepath):
            mode = os.stat(filepath).st_mode
            if permission_check(mode):
                results.append(filepath)
    return results
# Usage examples
large_files = filter_by_size('**/*', min_size=1024, max_size=1024*1024)
print(f"Files 1KB-1MB: {len(large_files)}")
recent_files = filter_by_modification_time('**/*.py', days_old=30)
print(f"Python files modified in last 30 days: {len(recent_files)}")
# Check for executable files
executable_files = filter_by_permissions(
    '**/*',
    lambda mode: bool(mode & stat.S_IXUSR)
)
print(f"Executable files: {len(executable_files)}")
# Readable files
readable_files = filter_by_permissions(
    '**/*',
    lambda mode: bool(mode & stat.S_IRUSR)
)
print(f"Readable files: {len(readable_files)}")
python code snippet end
Advanced Pattern Examples
python code snippet start
import glob
import re
from collections import defaultdict
# Complex real-world patterns
print("=== Real-World Patterns ===")
# Log file analysis
def find_log_files(date_pattern=None):
    """Find log files with optional date filtering."""
    patterns = [
        '**/*.log',
        '**/*log*',
        '**/logs/**/*',
        '**/*.log.*'  # Rotated logs
    ]
    all_logs = []
    for pattern in patterns:
        all_logs.extend(glob.glob(pattern, recursive=True))
    # Remove duplicates
    unique_logs = list(set(all_logs))
    if date_pattern:
        # Filter by date pattern in filename
        date_filtered = [
            log for log in unique_logs
            if re.search(date_pattern, log)
        ]
        return date_filtered
    return unique_logs
# Find backup files
def find_backup_files():
    """Find various backup file patterns."""
    backup_patterns = [
        '**/*.bak',
        '**/*.backup',
        '**/*~',           # Emacs backups
        '**/*.orig',       # Original files
        '**/#*#',          # Emacs auto-save
        '**/.#*',          # More Emacs files
        '**/Backup of *',  # Windows backup pattern
        '**/*_backup_*',   # Custom backup pattern
        '**/*.old'
    ]
    backups = []
    for pattern in backup_patterns:
        backups.extend(glob.glob(pattern, recursive=True))
    return list(set(backups))
# Find media files by type
def categorize_media_files():
    """Categorize media files by type."""
    media_patterns = {
        'images': ['**/*.jpg', '**/*.jpeg', '**/*.png', '**/*.gif', '**/*.bmp', '**/*.svg'],
        'videos': ['**/*.mp4', '**/*.avi', '**/*.mov', '**/*.mkv', '**/*.wmv', '**/*.flv'],
        'audio': ['**/*.mp3', '**/*.wav', '**/*.flac', '**/*.ogg', '**/*.m4a', '**/*.aac'],
        'documents': ['**/*.pdf', '**/*.doc', '**/*.docx', '**/*.txt', '**/*.rtf']
    }
    categorized = defaultdict(list)
    for category, patterns in media_patterns.items():
        for pattern in patterns:
            files = glob.glob(pattern, recursive=True)
            categorized[category].extend(files)
    return dict(categorized)
# Find source code files
def find_source_files():
    """Find source code files by language."""
    languages = {
        'python': ['**/*.py', '**/*.pyw', '**/*.pyx'],
        'javascript': ['**/*.js', '**/*.jsx', '**/*.ts', '**/*.tsx'],
        'web': ['**/*.html', '**/*.htm', '**/*.css', '**/*.scss', '**/*.sass'],
        'c_cpp': ['**/*.c', '**/*.cpp', '**/*.h', '**/*.hpp', '**/*.cc'],
        'java': ['**/*.java', '**/*.class', '**/*.jar'],
        'config': ['**/*.json', '**/*.yaml', '**/*.yml', '**/*.xml', '**/*.ini', '**/*.cfg']
    }
    source_files = {}
    for lang, patterns in languages.items():
        files = []
        for pattern in patterns:
            files.extend(glob.glob(pattern, recursive=True))
        source_files[lang] = list(set(files))
    return source_files
# Usage examples
logs = find_log_files(r'\d{4}-\d{2}-\d{2}') # Logs with date pattern
print(f"Date-stamped logs: {len(logs)}")
backups = find_backup_files()
print(f"Backup files found: {len(backups)}")
media = categorize_media_files()
for category, files in media.items():
    print(f"{category.title()}: {len(files)} files")
source = find_source_files()
for lang, files in source.items():
    print(f"{lang.title()}: {len(files)} files")
python code snippet end
Practical File Operations with Glob
python code snippet start
import glob
import os
import shutil
from pathlib import Path
import hashlib
from collections import defaultdict  # used by find_duplicates and organize_files_by_type below
# Batch file operations
print("=== Batch Operations ===")
def batch_rename(pattern, prefix="", suffix=""):
    """Batch rename files matching pattern."""
    files = glob.glob(pattern)
    renamed = []
    for filepath in files:
        directory = os.path.dirname(filepath)
        filename = os.path.basename(filepath)
        name, ext = os.path.splitext(filename)
        new_name = f"{prefix}{name}{suffix}{ext}"
        new_path = os.path.join(directory, new_name)
        try:
            os.rename(filepath, new_path)
            renamed.append((filepath, new_path))
        except OSError as e:
            print(f"Error renaming {filepath}: {e}")
    return renamed
def batch_copy(pattern, destination):
    """Copy all files matching pattern to destination."""
    files = glob.glob(pattern, recursive=True)
    os.makedirs(destination, exist_ok=True)
    copied = []
    for filepath in files:
        if os.path.isfile(filepath):
            filename = os.path.basename(filepath)
            dest_path = os.path.join(destination, filename)
            try:
                shutil.copy2(filepath, dest_path)
                copied.append((filepath, dest_path))
            except OSError as e:
                print(f"Error copying {filepath}: {e}")
    return copied
def find_duplicates(pattern):
    """Find duplicate files by content hash."""
    files = glob.glob(pattern, recursive=True)
    hash_map = defaultdict(list)
    for filepath in files:
        if os.path.isfile(filepath):
            try:
                with open(filepath, 'rb') as f:
                    file_hash = hashlib.md5(f.read()).hexdigest()
                hash_map[file_hash].append(filepath)
            except OSError:
                continue
    # Return only groups with duplicates
    duplicates = {k: v for k, v in hash_map.items() if len(v) > 1}
    return duplicates
def cleanup_temp_files():
    """Clean up temporary files safely."""
    temp_patterns = [
        '**/*.tmp',
        '**/*.temp',
        '**/.*~',
        '**/#*#',
        '**/*.pyc',
        '**/__pycache__/**/*',
        '**/node_modules/**/*',
        '**/.DS_Store'
    ]
    cleaned = []
    for pattern in temp_patterns:
        files = glob.glob(pattern, recursive=True)
        for filepath in files:
            try:
                if os.path.isfile(filepath):
                    os.remove(filepath)
                    cleaned.append(filepath)
                elif os.path.isdir(filepath) and not os.listdir(filepath):
                    os.rmdir(filepath)  # Remove empty directories
                    cleaned.append(filepath)
            except OSError as e:
                print(f"Could not remove {filepath}: {e}")
    return cleaned
# File organization
def organize_files_by_type(source_pattern, base_dir="organized"):
    """Organize files by type into subdirectories."""
    file_types = {
        'documents': ['.pdf', '.doc', '.docx', '.txt', '.rtf'],
        'images': ['.jpg', '.jpeg', '.png', '.gif', '.bmp'],
        'videos': ['.mp4', '.avi', '.mov', '.mkv'],
        'audio': ['.mp3', '.wav', '.flac', '.ogg'],
        'archives': ['.zip', '.rar', '.tar', '.gz', '.7z'],
        'code': ['.py', '.js', '.html', '.css', '.cpp', '.java']
    }
    files = glob.glob(source_pattern)
    organized = defaultdict(list)
    for filepath in files:
        if not os.path.isfile(filepath):
            continue
        _, ext = os.path.splitext(filepath.lower())
        category = 'other'  # Default category
        # Find matching category
        for cat, extensions in file_types.items():
            if ext in extensions:
                category = cat
                break
        # Create category directory
        category_dir = os.path.join(base_dir, category)
        os.makedirs(category_dir, exist_ok=True)
        # Move file
        filename = os.path.basename(filepath)
        dest_path = os.path.join(category_dir, filename)
        try:
            shutil.move(filepath, dest_path)
            organized[category].append((filepath, dest_path))
        except OSError as e:
            print(f"Error moving {filepath}: {e}")
    return dict(organized)
# Example usage (be careful with file operations!)
print("Example file operations (disabled for safety):")
print("# batch_rename('*.txt', prefix='backup_')")
print("# batch_copy('**/*.py', 'python_files')")
print("# duplicates = find_duplicates('**/*')")
print("# cleanup_temp_files()")
print("# organize_files_by_type('Downloads/*')")
python code snippet end
Performance and Best Practices
python code snippet start
import glob
import os
import time
from pathlib import Path
import fnmatch
# Performance comparison
print("=== Performance Tips ===")
def benchmark_glob_methods():
    """Compare different globbing approaches."""
    # Method 1: Basic glob
    start = time.time()
    result1 = glob.glob('**/*.py', recursive=True)
    time1 = time.time() - start
    # Method 2: pathlib
    start = time.time()
    result2 = list(Path('.').rglob('*.py'))
    time2 = time.time() - start
    # Method 3: os.walk with fnmatch
    start = time.time()
    result3 = []
    for root, dirs, files in os.walk('.'):
        for file in files:
            if fnmatch.fnmatch(file, '*.py'):
                result3.append(os.path.join(root, file))
    time3 = time.time() - start
    print(f"glob.glob: {time1:.4f}s ({len(result1)} files)")
    print(f"pathlib: {time2:.4f}s ({len(result2)} files)")
    print(f"os.walk: {time3:.4f}s ({len(result3)} files)")
    return result1, result2, result3
# Memory-efficient processing
def process_large_directory_efficiently():
    """Process large directories without loading all filenames."""
    # Use iglob for memory efficiency
    total_size = 0
    file_count = 0
    for filepath in glob.iglob('**/*', recursive=True):
        if os.path.isfile(filepath):
            total_size += os.path.getsize(filepath)
            file_count += 1
            # Report progress in chunks to avoid noisy output
            if file_count % 1000 == 0:
                print(f"Processed {file_count} files, total size: {total_size} bytes")
    return file_count, total_size
# Best practices
class GlobUtils:
    """Utility class with glob best practices."""

    @staticmethod
    def safe_glob(pattern, max_results=10000):
        """Safely glob with a result limit."""
        results = []
        count = 0
        for filepath in glob.iglob(pattern, recursive=True):
            results.append(filepath)
            count += 1
            if count >= max_results:
                print(f"Warning: Limited to {max_results} results")
                break
        return results

    @staticmethod
    def glob_with_exclude(pattern, exclude_patterns=None):
        """Glob with exclusion patterns."""
        if exclude_patterns is None:
            exclude_patterns = []
        results = glob.glob(pattern, recursive=True)
        # Filter out excluded patterns
        filtered = []
        for filepath in results:
            excluded = False
            for exclude in exclude_patterns:
                if fnmatch.fnmatch(filepath, exclude):
                    excluded = True
                    break
            if not excluded:
                filtered.append(filepath)
        return filtered

    @staticmethod
    def glob_case_insensitive(pattern):
        """Case-insensitive globbing (manual implementation)."""
        # Turn each letter into a two-character set, e.g. 'a' -> '[aA]'
        case_pattern = ""
        for char in pattern:
            if char.isalpha():
                case_pattern += f"[{char.lower()}{char.upper()}]"
            else:
                case_pattern += char
        return glob.glob(case_pattern, recursive=True)

    @staticmethod
    def validate_pattern(pattern):
        """Validate glob pattern syntax."""
        try:
            # Test the pattern with a dummy glob
            glob.glob(pattern)
            return True, "Pattern is valid"
        except Exception as e:
            return False, str(e)
# Usage examples
print("\nBest Practices Examples:")
# Safe globbing with limits
limited_results = GlobUtils.safe_glob('**/*', max_results=100)
print(f"Limited glob: {len(limited_results)} results")
# Exclude patterns
python_no_cache = GlobUtils.glob_with_exclude(
    '**/*.py',
    exclude_patterns=['**/__pycache__/**', '**/*.pyc']
)
print(f"Python files (no cache): {len(python_no_cache)}")
# Pattern validation
valid, message = GlobUtils.validate_pattern('**/*.py')
print(f"Pattern validation: {valid} - {message}")
# Case insensitive (where needed)
case_insensitive = GlobUtils.glob_case_insensitive('*.TXT')
print(f"Case insensitive .txt: {len(case_insensitive)}")
python code snippet end
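The manual character-class trick above breaks if the pattern already contains brackets; where a newer interpreter is available, pathlib offers a simpler route, since Path.glob() and Path.rglob() accept a case_sensitive flag on Python 3.12+ (a sketch, not a drop-in replacement for GlobUtils).
python code snippet start
from pathlib import Path

# Assumption: Python 3.12+, where case_sensitive is accepted;
# case_sensitive=None (the default) keeps the platform's native behaviour.
hits = list(Path('.').rglob('*.txt', case_sensitive=False))
print(f"Case-insensitive .txt via pathlib: {len(hits)} files")
python code snippet end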
Integration with Other Modules
python code snippet start
import glob
import json
import csv
import os
from pathlib import Path
import concurrent.futures
# Integration examples
print("=== Module Integration ===")
def process_json_files(pattern='**/*.json'):
    """Process all JSON files matching pattern."""
    json_data = {}
    for json_file in glob.glob(pattern, recursive=True):
        try:
            with open(json_file, 'r') as f:
                data = json.load(f)
            json_data[json_file] = data
        except (json.JSONDecodeError, IOError) as e:
            print(f"Error processing {json_file}: {e}")
    return json_data
def merge_csv_files(pattern='**/*.csv', output_file='merged.csv'):
    """Merge multiple CSV files into one."""
    csv_files = glob.glob(pattern, recursive=True)
    if not csv_files:
        return False
    # Read first file to get headers
    with open(csv_files[0], 'r') as f:
        reader = csv.reader(f)
        headers = next(reader)
    # Write merged file
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(headers)
        for csv_file in csv_files:
            with open(csv_file, 'r') as infile:
                reader = csv.reader(infile)
                next(reader)  # Skip header
                writer.writerows(reader)
    return True
# Parallel processing with glob
def process_files_parallel(pattern, processor_func, max_workers=4):
    """Process files in parallel."""
    files = glob.glob(pattern, recursive=True)
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(processor_func, file): file
            for file in files
        }
        for future in concurrent.futures.as_completed(future_to_file):
            file = future_to_file[future]
            try:
                result = future.result()
                results.append((file, result))
            except Exception as e:
                print(f"Error processing {file}: {e}")
    return results
# Example processor function
def analyze_file(filepath):
    """Analyze a single file."""
    try:
        stat_info = os.stat(filepath)
        return {
            'size': stat_info.st_size,
            'modified': stat_info.st_mtime,
            'type': 'file' if os.path.isfile(filepath) else 'directory'
        }
    except OSError:
        return None
# Usage examples
print("Integration examples:")
# Process JSON files
json_data = process_json_files('**/*.json')
print(f"JSON files processed: {len(json_data)}")
# Parallel file analysis
file_analysis = process_files_parallel('**/*.py', analyze_file, max_workers=2)
print(f"Files analyzed in parallel: {len(file_analysis)}")
# File monitoring with glob
class FileWatcher:
    """Simple file watcher using glob."""

    def __init__(self, pattern):
        self.pattern = pattern
        self.last_files = set(glob.glob(pattern, recursive=True))

    def check_changes(self):
        """Check for new or removed files."""
        current_files = set(glob.glob(self.pattern, recursive=True))
        new_files = current_files - self.last_files
        removed_files = self.last_files - current_files
        self.last_files = current_files
        return {
            'new': list(new_files),
            'removed': list(removed_files),
            'total': len(current_files)
        }
# Example watcher
watcher = FileWatcher('**/*.py')
changes = watcher.check_changes()
print(f"File changes: {changes['total']} total files")
python code snippet end
The glob module provides powerful and flexible file pattern matching that integrates seamlessly with Python’s file operations, making it essential for file management, data processing, and automation tasks. Use glob with pathlib for modern path handling and os for file operations, and combine it with JSON processing and CSV file handling for data pipeline automation.
Reference: Python Glob Module Documentation