Skip to main content Brad's PyNotes

PEP 289: Generator Expressions - Memory-Efficient Data Processing

TL;DR

PEP 289 introduced generator expressions in Python 2.4, providing memory-efficient alternatives to list comprehensions using lazy evaluation with syntax (expression for item in iterable if condition).

Interesting!

Generator expressions use only a tiny amount of memory regardless of the input size - processing a billion-item dataset uses the same memory as processing 10 items, since they generate values on-demand rather than storing them all at once.

Basic Generator Expression Syntax

python code snippet start

# List comprehension (builds the whole list in memory up front)
numbers = list(range(1, 11))
squares_list = [x * x for x in numbers if not x % 2]
print(squares_list)  # [4, 16, 36, 64, 100]

# Generator expression (builds a lazy generator object instead)
squares_gen = (x * x for x in numbers if not x % 2)
print(squares_gen)  # <generator object <genexpr> at 0x...>

# Consume the generator one value at a time
for square in squares_gen:
    print(square)  # 4, 16, 36, 64, 100

python code snippet end

Memory Efficiency Comparison

python code snippet start

import sys

# List comprehension - stores all values
# NOTE: sys.getsizeof is shallow - it reports the list object's own storage
# and does not follow references into the int objects the list holds.
large_list = [x for x in range(1000000)]
print(f"List size: {sys.getsizeof(large_list)} bytes")

# Generator expression - stores only the generator state
large_gen = (x for x in range(1000000))
print(f"Generator size: {sys.getsizeof(large_gen)} bytes")

# Generator is ~40,000x smaller in memory!
# List: ~8MB vs Generator: ~200 bytes
# (approximate figures - presumably they vary by Python version and platform)

python code snippet end

Lazy Evaluation in Action

python code snippet start

def expensive_operation(x):
    """Stand in for a costly computation: announce x, then return x squared."""
    print(f"Processing {x}...")
    squared = x * x
    return squared

numbers = list(range(1, 6))

# A list comprehension calls the function for every item right away
print("Creating list comprehension:")
squares_list = [expensive_operation(n) for n in numbers]
print("List created!")

# A generator expression only records what to do; nothing runs yet
print("\nCreating generator expression:")
squares_gen = (expensive_operation(n) for n in numbers)
print("Generator created!")  # No processing yet!

# Each next() call drives exactly one item through the function
print("\nConsuming first two values:")
first_value = next(squares_gen)  # Only now does processing begin
print(first_value)
second_value = next(squares_gen)
print(second_value)

python code snippet end

Generator Expressions with Functions

python code snippet start

# sum() pulls values from the generator one at a time
numbers = range(1000000)
total = sum(x * x for x in numbers if not x % 2)
print(f"Sum of squares of even numbers: {total}")

# any() and all() stop as soon as the answer is known
data = list(range(1, 6))
has_even = any(not x % 2 for x in data)
all_positive = all(0 < x for x in data)

print(f"Has even number: {has_even}")
print(f"All positive: {all_positive}")

# max() and min() accept a filtered generator directly
scores = [85, 92, 78, 96, 88]
best_score = max(s for s in scores if s >= 80)
print(f"Best qualifying score: {best_score}")

python code snippet end

Chaining Generator Expressions

python code snippet start

# Each stage below wraps the previous one; nothing runs until consumed
numbers = range(100)

# Stage 1 - keep the even numbers
evens = (n for n in numbers if not n % 2)

# Stage 2 - square each of them
squares = (n * n for n in evens)

# Stage 3 - keep only squares above 100
large_squares = (sq for sq in squares if sq > 100)

# Pulling from the last stage drives the whole chain
result = list(large_squares)
print(f"Large squares of even numbers: {result[:10]}")

# Filter and transform in a single multi-line expression
words = ["hello", "world", "python", "generator", "expression"]
result = (
    w.upper()
    for w in words
    if len(w) > 5
)
print(list(result))  # ['PYTHON', 'GENERATOR', 'EXPRESSION']

python code snippet end

File Processing with Generators

python code snippet start

# Process large files efficiently
def process_log_file(filename):
    """Lazily yield one parsed record per line containing 'ERROR'.

    The file is iterated line by line, so it is never read into memory
    as a whole. Each matching line is stripped and passed to
    process_error_line.
    """
    with open(filename, 'r') as log:
        # Lazy filter: only lines mentioning ERROR, surrounding whitespace removed
        stripped_errors = (raw.strip() for raw in log if 'ERROR' in raw)

        # Parse each matching line only when the caller asks for it
        yield from map(process_error_line, stripped_errors)

def process_error_line(line):
    """Extract useful information from an error line.

    The line is split on ' - '. The first field is taken as the timestamp
    and the last field as the message.

    Args:
        line: One log line (already stripped by the caller).

    Returns:
        A dict with keys 'timestamp', 'level' (always 'ERROR') and
        'message'. When the line contains no ' - ' separator, 'timestamp'
        is '' and 'message' is the whole line.
    """
    parts = line.split(' - ')
    # str.split(sep) always returns at least one element, so "no fields"
    # has to be detected by length rather than by truthiness.
    has_fields = len(parts) > 1
    return {
        'timestamp': parts[0] if has_fields else '',
        'level': 'ERROR',
        'message': parts[-1],
    }

# Usage (works with files of any size)
# for error in process_log_file('large_log.txt'):
#     print(error)

python code snippet end

Real-World Examples

Data Transformation Pipeline

python code snippet start

# Process CSV-like data efficiently
def _parse_sales_row(fields):
    """Return a sales record dict for one split row, or None if malformed."""
    if len(fields) != 3 or not fields[1].isdigit():
        return None
    try:
        # isdigit() accepts some characters int() rejects (e.g. superscript
        # digits), and the price is not pre-validated, so guard both.
        return {
            'product': fields[0],
            'quantity': int(fields[1]),
            'price': float(fields[2]),
        }
    except ValueError:
        return None


def process_sales_data(data_rows):
    """Transform sales data using generator expressions.

    Args:
        data_rows: Iterable of 'product,quantity,price' strings.

    Returns:
        A lazy generator of dicts with keys 'product', 'quantity', 'price'
        and 'total' (quantity * price), limited to rows whose total is
        greater than 100. Blank rows and rows that cannot be parsed are
        skipped instead of raising part-way through iteration.
    """
    # Parse and clean data
    parsed_rows = (
        row.strip().split(',')
        for row in data_rows
        if row.strip()
    )

    # Convert to dictionaries, dropping rows the parser rejected
    candidate_records = (_parse_sales_row(row) for row in parsed_rows)
    sales_records = (
        record
        for record in candidate_records
        if record is not None
    )

    # Calculate totals
    sales_with_totals = (
        {**record, 'total': record['quantity'] * record['price']}
        for record in sales_records
    )

    # Filter significant sales
    significant_sales = (
        record
        for record in sales_with_totals
        if record['total'] > 100
    )

    return significant_sales

# Sample data
csv_data = [
    "Widget,10,15.50",
    "Gadget,5,25.00",
    "Tool,20,8.75",
    "",  # Blank row - the pipeline skips it
    "Device,3,45.99",
]

# Records are produced one at a time as the loop asks for them
for sale in process_sales_data(csv_data):
    product, total = sale['product'], sale['total']
    print(f"{product}: ${total:.2f}")

python code snippet end

Mathematical Sequences

python code snippet start

# Generate mathematical sequences efficiently
def fibonacci_generator():
    """Yield the Fibonacci sequence (0, 1, 1, 2, 3, ...) without end."""
    current, upcoming = 0, 1
    while True:
        yield current
        current, upcoming = upcoming, current + upcoming

# First 10 Fibonacci numbers greater than 100
large_fibs = (
    value for value in fibonacci_generator()
    if value > 100
)

# The source is infinite, so stop explicitly after ten matches
first_10_large = []
for i, fib in enumerate(large_fibs):
    if i < 10:
        first_10_large.append(fib)
        continue
    break

print(f"First 10 Fibonacci numbers > 100: {first_10_large}")

# Prime number generator expression
def is_prime(n):
    """Return True if n is prime, using trial division up to sqrt(n)."""
    if n < 2:
        return False
    limit = int(n**0.5) + 1
    # Prime exactly when no candidate divisor in [2, limit) divides n
    return not any(n % divisor == 0 for divisor in range(2, limit))

# First 20 prime numbers
primes = (candidate for candidate in range(2, 1000) if is_prime(candidate))
# zip stops once range(20) runs out, so exactly 20 primes are drawn
first_20_primes = [prime for _, prime in zip(range(20), primes)]
print(f"First 20 primes: {first_20_primes}")

python code snippet end

Text Processing

python code snippet start

# Process text efficiently
def analyze_text(text):
    """Lazily yield the length of each meaningful word in text.

    Words are lower-cased and stripped of surrounding punctuation. A word
    is meaningful when it is longer than two characters and purely
    alphabetic.
    """
    punctuation = '.,!?";()[]{}'

    # Tokenize line by line and normalize each token
    cleaned = (
        token.lower().strip(punctuation)
        for line in text.split('\n')
        for token in line.split()
        if token.strip()
    )

    # Keep meaningful tokens and report only their lengths
    return (
        len(token)
        for token in cleaned
        if len(token) > 2 and token.isalpha()
    )

sample_text = """
Hello, world! This is a sample text.
We are testing generator expressions.
They are memory efficient and powerful.
"""

# Materialize once, because the lengths are needed for both sum() and len()
lengths = [*analyze_text(sample_text)]
print(f"Word lengths: {lengths}")
average_length = sum(lengths) / len(lengths)
print(f"Average word length: {average_length:.2f}")

python code snippet end

Performance Comparison

python code snippet start

import time
import tracemalloc

def time_and_memory(func, *args):
    """Run ``func(*args)`` and measure its elapsed time and peak memory.

    Args:
        func: Callable to measure.
        *args: Positional arguments passed through to ``func``.

    Returns:
        A ``(result, elapsed_seconds, peak_bytes)`` tuple, where
        ``peak_bytes`` is the peak value reported by
        ``tracemalloc.get_traced_memory()`` for the call.

    Raises:
        Whatever ``func`` raises; tracing is stopped before it propagates.
    """
    tracemalloc.start()
    try:
        # perf_counter is the clock intended for measuring short durations;
        # time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args)
        elapsed = time.perf_counter() - start_time
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, even if func raised
        tracemalloc.stop()

    return result, elapsed, peak

def list_comprehension_approach(n):
    """Sum the squares of the even numbers below n via intermediate lists."""
    # Both lists are fully materialized on purpose, for the comparison
    all_values = [value for value in range(n)]
    even_squares = [value * value for value in all_values if not value % 2]
    return sum(even_squares)

def generator_approach(n):
    """Sum the squares of the even numbers below n without building a list."""
    return sum(value * value for value in range(n) if not value % 2)

# Compare approaches
n = 1000000

result1, time1, memory1 = time_and_memory(list_comprehension_approach, n)
result2, time2, memory2 = time_and_memory(generator_approach, n)

# Peak memory is reported in bytes; convert to MB for display
list_mb = memory1 / 1024 / 1024
print(f"List comprehension: {time1:.3f}s, {list_mb:.1f}MB")
gen_mb = memory2 / 1024 / 1024
print(f"Generator expression: {time2:.3f}s, {gen_mb:.1f}MB")
savings_pct = (memory1 - memory2) / memory1 * 100
print(f"Memory savings: {savings_pct:.1f}%")

python code snippet end

Best Practices

python code snippet start

# Spread a long generator expression over several lines inside the call
# Good
result = sum(
    even * even
    for even in range(100)
    if not even % 2
)

# Use generators for large datasets
def process_large_dataset(data_source):
    """Return a lazy generator of transformed items that pass validation.

    Nothing is processed until the returned generator is iterated.
    """
    pipeline = (
        transform_item(entry)
        for entry in data_source
        if validate_item(entry)
    )
    return pipeline

def transform_item(item):
    """Return item converted with its upper() method."""
    upper_cased = item.upper()
    return upper_cased

def validate_item(item):
    """Return True when item has more than three elements/characters."""
    return 3 < len(item)

# Convert to list only when necessary
data = ["hello", "world", "python", "programming"]
processed = process_large_dataset(data)

# Materialize only at the point where a real list is required
result_list = [*processed]  # Now it's a list
print(result_list)

# Use generator expressions with built-in functions
# sum(), max(), min(), any(), all() work efficiently with generators
numbers = range(1000000)

# Each built-in consumes its generator without building a list
total = sum(n for n in numbers if not n % 3)
maximum = max(n * n for n in numbers if not n % 2)
has_large = any(999990 < n for n in numbers)

print(f"Sum of multiples of 3: {total}")
print(f"Max square of even numbers: {maximum}")
print(f"Has numbers > 999990: {has_large}")

python code snippet end

Common Pitfalls

python code snippet start

# Generator expressions are consumed once
gen = (n * n for n in range(5))
print(list(gen))  # [0, 1, 4, 9, 16]
print(list(gen))  # [] - the first list() call already used everything up

# Create new generator for multiple uses
def create_squares():
    """Return a fresh generator of the squares of 0 through 4."""
    return (n * n for n in range(5))

# Every call hands back an independent, unconsumed generator
gen1, gen2 = create_squares(), create_squares()
print(list(gen1))  # [0, 1, 4, 9, 16]
print(list(gen2))  # [0, 1, 4, 9, 16]

# Be careful with late binding of outer variables inside a loop.
# Only the outermost iterable (the first `for ... in <iterable>`) is
# evaluated when the generator expression is created. Every other name in
# the expression is looked up later, when the generator is consumed.
squares = []
for i in range(3):
    squares.append((x + i for x in range(2)))

# All generators see the final value of i!
for gen in squares:
    print(list(gen))  # All print [2, 3] (i=2)

# Fix by binding i in the outermost iterable, which is evaluated immediately
squares = []
for i in range(3):
    squares.append((x + i for i in [i] for x in range(2)))  # Capture i

# Consuming these now gives [0, 1], [1, 2] and [2, 3] respectively

# Or use a factory function
def make_generator(n):
    """Return a generator over 0..n-1, with n fixed at call time."""
    return (value for value in range(n))

# One independent generator per value of i, each with its own n
squares = list(map(make_generator, range(3)))

python code snippet end

Generator expressions revolutionize how we process data in Python, enabling memory-efficient operations on datasets of any size while maintaining clean, readable code. They work seamlessly with itertools functions and built-in functions like sum(). For complex data processing, combine them with functools operations and CSV file processing.

Reference: PEP 289 - Generator Expressions