PEP 289: Generator Expressions - Memory-Efficient Data Processing
TL;DR
PEP 289 introduced generator expressions in Python 2.4, providing a memory-efficient alternative to list comprehensions: lazy evaluation with the syntax (expression for item in iterable if condition).
Interesting!
Generator expressions use only a tiny, constant amount of memory regardless of input size: the generator object for a billion-item dataset is the same size as the one for 10 items, since values are produced on demand rather than stored all at once.
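A minimal sketch that makes this concrete (on CPython, sys.getsizeof reports the size of the generator object itself):
python code snippet start
import sys

# Both generator objects are the same size, regardless of how many
# items the underlying range would produce
small_gen = (x for x in range(10))
huge_gen = (x for x in range(10**9))
print(sys.getsizeof(small_gen) == sys.getsizeof(huge_gen))  # True
python code snippet end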
Basic Generator Expression Syntax
python code snippet start
# List comprehension (creates entire list in memory)
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
squares_list = [x**2 for x in numbers if x % 2 == 0]
print(squares_list) # [4, 16, 36, 64, 100]
# Generator expression (creates a generator object)
squares_gen = (x**2 for x in numbers if x % 2 == 0)
print(squares_gen)  # <generator object <genexpr> at 0x...>

# Consume the generator
for square in squares_gen:
    print(square)  # 4, 16, 36, 64, 100
python code snippet end
Memory Efficiency Comparison
python code snippet start
import sys
# List comprehension - stores all values
large_list = [x for x in range(1000000)]
print(f"List size: {sys.getsizeof(large_list)} bytes")
# Generator expression - stores only the generator state
large_gen = (x for x in range(1000000))
print(f"Generator size: {sys.getsizeof(large_gen)} bytes")
# Generator is ~40,000x smaller in memory!
# List: ~8MB vs Generator: ~200 bytes
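# Note: sys.getsizeof is shallow - for the list it counts only the
# array of object pointers, not the int objects themselves, so the
# real gap in total memory use is even larger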
python code snippet end
Lazy Evaluation in Action
python code snippet start
def expensive_operation(x):
    """Simulate an expensive computation."""
    print(f"Processing {x}...")
    return x * x

numbers = [1, 2, 3, 4, 5]
# List comprehension - executes immediately
print("Creating list comprehension:")
squares_list = [expensive_operation(x) for x in numbers]
print("List created!")
print("\nCreating generator expression:")
squares_gen = (expensive_operation(x) for x in numbers)
print("Generator created!") # No processing yet!
print("\nConsuming first two values:")
print(next(squares_gen)) # Only now does processing begin
print(next(squares_gen))
python code snippet end
Generator Expressions with Functions
python code snippet start
# sum() works efficiently with generators
numbers = range(1000000)
total = sum(x**2 for x in numbers if x % 2 == 0)
print(f"Sum of squares of even numbers: {total}")
# any() and all() with generators
data = [1, 2, 3, 4, 5]
has_even = any(x % 2 == 0 for x in data)
all_positive = all(x > 0 for x in data)
print(f"Has even number: {has_even}")
print(f"All positive: {all_positive}")
# max() and min() with generators
scores = [85, 92, 78, 96, 88]
best_score = max(score for score in scores if score >= 80)
print(f"Best qualifying score: {best_score}")
python code snippet end
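One detail from PEP 289 worth knowing: the surrounding parentheses may be dropped only when the generator expression is the sole argument of a call. With any additional argument, the expression needs its own parentheses:
python code snippet start
data = [1, 2, 3, 4, 5]

# Sole argument: no extra parentheses needed
total = sum(x**2 for x in data)

# With a second argument this is a SyntaxError:
# total = sum(x**2 for x in data, 10)
# Add explicit parentheses instead
total = sum((x**2 for x in data), 10)
print(total)  # 65 (55 plus the start value of 10)
python code snippet end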
Chaining Generator Expressions
python code snippet start
# Chain multiple generator expressions
numbers = range(100)
# First filter: even numbers
evens = (x for x in numbers if x % 2 == 0)
# Second transformation: square them
squares = (x**2 for x in evens)
# Third filter: only large squares
large_squares = (x for x in squares if x > 100)
# Consume the chain
result = list(large_squares)
print(f"Large squares of even numbers: {result[:10]}")
# More complex chaining
words = ["hello", "world", "python", "generator", "expression"]
result = (
    word.upper()
    for word in words
    if len(word) > 5
)
print(list(result)) # ['PYTHON', 'GENERATOR', 'EXPRESSION']
python code snippet end
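The three chained stages above can also be collapsed into a single expression; the chained form mainly buys you named, reusable steps. A sketch of the equivalent one-liner:
python code snippet start
# Equivalent single-expression form of the evens -> squares -> large chain
large_squares = (x**2 for x in range(100) if x % 2 == 0 and x**2 > 100)
print(list(large_squares)[:10])
python code snippet end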
File Processing with Generators
python code snippet start
# Process large files efficiently
def process_log_file(filename):
    """Process a log file line by line without loading it into memory."""
    with open(filename, 'r') as file:
        # Generator expression for error lines
        error_lines = (
            line.strip()
            for line in file
            if 'ERROR' in line
        )
        # Process errors one at a time
        for error_line in error_lines:
            # Extract timestamp, message, etc.
            yield process_error_line(error_line)

def process_error_line(line):
    """Extract useful information from an error line."""
    parts = line.split(' - ')
    return {
        'timestamp': parts[0] if parts else '',
        'level': 'ERROR',
        'message': parts[-1] if parts else line
    }
# Usage (works with files of any size)
# for error in process_log_file('large_log.txt'):
# print(error)
python code snippet end
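Because generator expressions feed directly into reducers like sum(), simple aggregations over a large file need no intermediate list. A minimal sketch, assuming the same hypothetical large_log.txt:
python code snippet start
# Count ERROR lines without holding the file in memory
def count_errors(filename):
    with open(filename, 'r') as file:
        return sum(1 for line in file if 'ERROR' in line)

# Usage (hypothetical file):
# print(count_errors('large_log.txt'))
python code snippet end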
Real-World Examples
Data Transformation Pipeline
python code snippet start
# Process CSV-like data efficiently
def process_sales_data(data_rows):
    """Transform sales data using generator expressions."""
    # Parse and clean data
    parsed_rows = (
        row.strip().split(',')
        for row in data_rows
        if row.strip()
    )
    # Convert to dictionaries
    sales_records = (
        {
            'product': row[0],
            'quantity': int(row[1]),
            'price': float(row[2])
        }
        for row in parsed_rows
        if len(row) == 3 and row[1].isdigit()
    )
    # Calculate totals
    sales_with_totals = (
        {**record, 'total': record['quantity'] * record['price']}
        for record in sales_records
    )
    # Filter significant sales
    significant_sales = (
        record
        for record in sales_with_totals
        if record['total'] > 100
    )
    return significant_sales

# Sample data
csv_data = [
    "Widget,10,15.50",
    "Gadget,5,25.00",
    "Tool,20,8.75",
    "",  # Empty line
    "Device,3,45.99"
]

# Process efficiently
for sale in process_sales_data(csv_data):
    print(f"{sale['product']}: ${sale['total']:.2f}")
python code snippet end
Mathematical Sequences
python code snippet start
# Generate mathematical sequences efficiently
def fibonacci_generator():
    """Generate Fibonacci numbers indefinitely."""
    a, b = 0, 1
    while True:
        yield a
        a, b = b, a + b

# First 10 Fibonacci numbers greater than 100
large_fibs = (
    fib for fib in fibonacci_generator()
    if fib > 100
)
first_10_large = []
for i, fib in enumerate(large_fibs):
    if i >= 10:
        break
    first_10_large.append(fib)
print(f"First 10 Fibonacci numbers > 100: {first_10_large}")

# Prime number generator expression
def is_prime(n):
    if n < 2:
        return False
    return all(n % i != 0 for i in range(2, int(n**0.5) + 1))

# First 20 prime numbers
primes = (n for n in range(2, 1000) if is_prime(n))
first_20_primes = [next(primes) for _ in range(20)]
print(f"First 20 primes: {first_20_primes}")
python code snippet end
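The enumerate-and-break loop above is a common pattern; itertools.islice expresses the same "take the first n" idea more directly. A short sketch, reusing fibonacci_generator from the block above:
python code snippet start
from itertools import islice

# Same result as the loop above: first 10 Fibonacci numbers > 100
large_fibs = (fib for fib in fibonacci_generator() if fib > 100)
first_10_large = list(islice(large_fibs, 10))
print(first_10_large)
python code snippet end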
Text Processing
python code snippet start
# Process text efficiently
def analyze_text(text):
    """Analyze text using generator expressions."""
    # Split into words and strip punctuation
    words = (
        word.lower().strip('.,!?";()[]{}')
        for line in text.split('\n')
        for word in line.split()
        if word.strip()
    )
    # Filter meaningful words
    meaningful_words = (
        word for word in words
        if len(word) > 2 and word.isalpha()
    )
    # Count word lengths
    word_lengths = (len(word) for word in meaningful_words)
    return word_lengths

sample_text = """
Hello, world! This is a sample text.
We are testing generator expressions.
They are memory efficient and powerful.
"""
lengths = list(analyze_text(sample_text))
print(f"Word lengths: {lengths}")
print(f"Average word length: {sum(lengths) / len(lengths):.2f}")
python code snippet end
Performance Comparison
python code snippet start
import time
import tracemalloc
def time_and_memory(func, *args):
    """Measure execution time and peak memory usage."""
    tracemalloc.start()
    start_time = time.perf_counter()
    result = func(*args)
    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return result, end_time - start_time, peak

def list_comprehension_approach(n):
    """Process data using list comprehensions."""
    numbers = [x for x in range(n)]
    squares = [x**2 for x in numbers if x % 2 == 0]
    return sum(squares)

def generator_approach(n):
    """Process data using generator expressions."""
    squares = (x**2 for x in range(n) if x % 2 == 0)
    return sum(squares)
# Compare approaches
n = 1000000
result1, time1, memory1 = time_and_memory(list_comprehension_approach, n)
result2, time2, memory2 = time_and_memory(generator_approach, n)
print(f"List comprehension: {time1:.3f}s, {memory1/1024/1024:.1f}MB")
print(f"Generator expression: {time2:.3f}s, {memory2/1024/1024:.1f}MB")
print(f"Memory savings: {(memory1-memory2)/memory1*100:.1f}%")
python code snippet end
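Expect the generator version to show dramatically lower peak memory with similar (or slightly slower) wall-clock time: generators trade a small per-item call overhead for constant memory, so when the full result fits comfortably in memory and will be reused, a list comprehension can still be the faster choice.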
Best Practices
python code snippet start
# Use parentheses for clarity in complex expressions
# Good
result = sum(
    x**2
    for x in range(100)
    if x % 2 == 0
)

# Use generators for large datasets
def process_large_dataset(data_source):
    """Process a large dataset efficiently."""
    return (
        transform_item(item)
        for item in data_source
        if validate_item(item)
    )

def transform_item(item):
    return item.upper()

def validate_item(item):
    return len(item) > 3
# Convert to list only when necessary
data = ["hello", "world", "python", "programming"]
processed = process_large_dataset(data)
# Only materialize when needed
result_list = list(processed) # Now it's a list
print(result_list)
# Use generator expressions with built-in functions
# sum(), max(), min(), any(), all() work efficiently with generators
numbers = range(1000000)
# Memory-efficient operations
total = sum(x for x in numbers if x % 3 == 0)
maximum = max(x**2 for x in numbers if x % 2 == 0)
has_large = any(x > 999990 for x in numbers)
print(f"Sum of multiples of 3: {total}")
print(f"Max square of even numbers: {maximum}")
print(f"Has numbers > 999990: {has_large}")
python code snippet end
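One practical caveat to add to these practices: reach for a list instead when you need len(), indexing, or multiple passes, since a generator supports none of these. A minimal sketch:
python code snippet start
gen = (x**2 for x in range(5))
# len(gen)   # TypeError: object of type 'generator' has no len()
# gen[0]     # TypeError: 'generator' object is not subscriptable
materialized = list(gen)  # Materialize when you need list behavior
print(len(materialized), materialized[0])  # 5 0
python code snippet end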
Common Pitfalls
python code snippet start
# Generator expressions are consumed once
gen = (x**2 for x in range(5))
print(list(gen)) # [0, 1, 4, 9, 16]
print(list(gen)) # [] - Generator is exhausted!
# Create new generator for multiple uses
def create_squares():
    return (x**2 for x in range(5))

gen1 = create_squares()
gen2 = create_squares()
print(list(gen1)) # [0, 1, 4, 9, 16]
print(list(gen2)) # [0, 1, 4, 9, 16]
# Be careful with late binding of free variables
# (the outermost iterable is evaluated immediately, but other free
# variables are looked up only when the generator is consumed)
gens = []
for i in range(3):
    gens.append(x * i for x in range(3))
# Consumed after the loop, every generator sees the final i (i=2)!
for gen in gens:
    print(list(gen))  # All print [0, 2, 4]

# Fix: bind the variable at creation time with a factory function
def make_multiplier(n):
    return (x * n for x in range(3))

gens = [make_multiplier(i) for i in range(3)]
for gen in gens:
    print(list(gen))  # [0, 0, 0], [0, 1, 2], [0, 2, 4]
python code snippet end
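The flip side of the late-binding pitfall: PEP 289 specifies that the outermost iterable is evaluated immediately, when the generator expression is created, so rebinding that name afterwards has no effect:
python code snippet start
data = [1, 2, 3]
gen = (x for x in data)   # data is evaluated right here
data = [10, 20, 30]       # Rebinding the name does not affect gen
print(list(gen))          # [1, 2, 3]
python code snippet end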
Generator expressions revolutionize how we process data in Python, enabling memory-efficient operations on datasets of any size while keeping code clean and readable. They work seamlessly with itertools and with built-in functions like sum(). For complex data processing, combine them with functools operations and CSV file processing.
Reference: PEP 289 - Generator Expressions, https://peps.python.org/pep-0289/