Brad's PyNotes

CSV Module: Easy CSV File Reading and Writing

TL;DR

The csv module provides csv.reader(), csv.writer(), csv.DictReader(), and csv.DictWriter() for robust CSV file processing with automatic dialect detection and proper handling of quotes, delimiters, and line endings.

Interesting!

The csv module can automatically detect CSV dialects (delimiter, quote style, etc.) from sample data, and it properly handles tricky cases like embedded commas, quotes, and newlines within CSV fields that would break naive string splitting.

Basic CSV Reading

python code snippet start

import csv

# Reading with csv.reader()
with open('data.csv', 'r') as f:
    rows = csv.reader(f)

    # The first row of the file holds the column names
    headers = next(rows)
    print(f"Headers: {headers}")

    # Every remaining row comes back as a list of strings
    for record in rows:
        print(record)  # Each row is a list
        print(f"Name: {record[0]}, Age: {record[1]}")

# Sample data.csv:
# Name,Age,City
# Alice,25,New York
# Bob,30,London

python code snippet end

DictReader for Named Access

python code snippet start

import csv

# Reading with DictReader (recommended)
with open('employees.csv', 'r') as f:
    records = csv.DictReader(f)

    # DictReader maps each row onto the header names
    for record in records:
        print(f"Name: {record['name']}")
        print(f"Department: {record['department']}")
        print(f"Salary: ${record['salary']}")
        print("---")

# Alternative: specify fieldnames explicitly
with open('data.csv', 'r') as f:
    records = csv.DictReader(f, fieldnames=['name', 'age', 'city'])
    # With explicit fieldnames the header line is treated as data,
    # so consume it once to skip it.
    next(records)  # Skip header if present

    for record in records:
        print(record['name'], record['age'])

python code snippet end

Writing CSV Files

python code snippet start

import csv

# Writing with csv.writer()
data = [
    ['Name', 'Age', 'City'],
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'London'],
    ['Charlie', 35, 'Tokyo'],
]

with open('output.csv', 'w', newline='') as out:
    sheet = csv.writer(out)
    sheet.writerow(data[0])    # header row first
    sheet.writerows(data[1:])  # then all data rows in one call

# Writing individual rows
with open('people.csv', 'w', newline='') as out:
    sheet = csv.writer(out)
    # NOTE: assumes people_data (objects with .name/.age/.city) is defined elsewhere
    for person in people_data:
        sheet.writerow([person.name, person.age, person.city])

python code snippet end

DictWriter for Dictionary Data

python code snippet start

import csv

# Writing with DictWriter (recommended)
employees = [
    {'name': 'Alice', 'department': 'Engineering', 'salary': 75000},
    {'name': 'Bob', 'department': 'Sales', 'salary': 65000},
    {'name': 'Charlie', 'department': 'Marketing', 'salary': 70000},
]

with open('employees.csv', 'w', newline='') as out:
    columns = ['name', 'department', 'salary']
    writer = csv.DictWriter(out, fieldnames=columns)

    # Column names first, then every record in one call
    writer.writeheader()
    writer.writerows(employees)

python code snippet end

Custom Dialects and Formatting

python code snippet start

import csv

# Custom dialect: pipe-delimited, minimally quoted, Unix line endings
class MyDialect(csv.excel):
    delimiter = '|'
    quotechar = '"'
    doublequote = True
    skipinitialspace = True
    lineterminator = '\n'
    quoting = csv.QUOTE_MINIMAL

# Make the dialect usable by name
csv.register_dialect('pipes', MyDialect)

# Write a file with the registered dialect
with open('pipe_data.csv', 'w', newline='') as out:
    pipe_writer = csv.writer(out, dialect='pipes')
    pipe_writer.writerow(['Name', 'Age', 'City'])
    pipe_writer.writerow(['Alice', 25, 'New York'])

# Read it back with the same dialect
with open('pipe_data.csv', 'r') as src:
    for record in csv.reader(src, dialect='pipes'):
        print(record)

python code snippet end

Handling Different Formats

python code snippet start

import csv

# Tab-separated values
with open('data.tsv', 'w', newline='') as out:
    tsv = csv.writer(out, delimiter='\t')
    tsv.writerow(['Name', 'Score', 'Grade'])
    tsv.writerow(['Alice', 95, 'A'])

# Custom quote handling
with open('quotes.csv', 'w', newline='') as out:
    quoted = csv.writer(out, quoting=csv.QUOTE_ALL)  # Quote everything
    quoted.writerow(['Name', 'Comment'])
    quoted.writerow(['Bob', 'Said "Hello, World!"'])

# Different quoting options:
# csv.QUOTE_MINIMAL - Only when necessary (default)
# csv.QUOTE_ALL - Quote all fields
# csv.QUOTE_NONNUMERIC - Quote non-numeric fields
# csv.QUOTE_NONE - Never quote (escape instead)

python code snippet end

Practical Data Processing

Sales Data Analysis

python code snippet start

import csv
from collections import defaultdict

def analyze_sales_data(filename):
    """Aggregate revenue per region and units sold per product.

    Expects a CSV with a header row: region,product,quantity,price.
    Returns a pair of plain dicts: (sales_by_region, product_totals).
    """
    revenue_per_region = defaultdict(float)
    units_per_product = defaultdict(int)

    with open(filename, 'r') as src:
        for record in csv.DictReader(src):
            units = int(record['quantity'])
            revenue_per_region[record['region']] += units * float(record['price'])
            units_per_product[record['product']] += units

    return dict(revenue_per_region), dict(units_per_product)

# Usage
# sales.csv format: region,product,quantity,price
# NOTE: requires sales.csv to exist in the working directory.
sales_by_region, product_totals = analyze_sales_data('sales.csv')
print("Sales by region:", sales_by_region)
print("Product quantities:", product_totals)

python code snippet end

Data Cleaning and Transformation

python code snippet start

import csv

def clean_customer_data(input_file, output_file):
    """Read raw customer rows, normalize each field, and write valid rows out.

    Input columns: id, full_name, email_address, phone_number, city.
    Output columns: customer_id, name, email, phone, city.
    Rows failing validation (no '@' in email, or fewer than 10 phone digits)
    are skipped with a message.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        reader = csv.DictReader(infile)

        # Define output fields
        columns = ['customer_id', 'name', 'email', 'phone', 'city']
        writer = csv.DictWriter(outfile, fieldnames=columns)
        writer.writeheader()

        for record in reader:
            # Normalize: trim whitespace, title-case names/cities,
            # lower-case emails, keep only digits of the phone number.
            digits_only = ''.join(ch for ch in record['phone_number'] if ch.isdigit())
            cleaned = {
                'customer_id': record['id'].strip(),
                'name': record['full_name'].title().strip(),
                'email': record['email_address'].lower().strip(),
                'phone': digits_only,
                'city': record['city'].title().strip(),
            }

            if '@' in cleaned['email'] and len(cleaned['phone']) >= 10:
                writer.writerow(cleaned)
            else:
                print(f"Skipped invalid row: {record['id']}")

# Usage
# NOTE: requires raw_customers.csv to exist in the working directory.
clean_customer_data('raw_customers.csv', 'clean_customers.csv')

python code snippet end

CSV to JSON Conversion

python code snippet start

import csv
import json

def csv_to_json(csv_file, json_file):
    """Convert a CSV file to a JSON array of row objects.

    Numeric-looking fields are converted to int or float; everything else
    is kept as a string.

    Args:
        csv_file: path to the input CSV (first row must be the header).
        json_file: path the JSON output is written to.
    """
    def coerce(value):
        # Try int first ("25" -> 25), then float ("1.5" -> 1.5).
        # BUG FIX: the old check `value.replace('.', '').isdigit()` accepted
        # strings like "1.2.3", making float() raise ValueError; it also
        # never converted negatives and crashed on None (missing fields).
        for cast in (int, float):
            try:
                return cast(value)
            except (TypeError, ValueError):
                pass
        return value

    data = []
    with open(csv_file, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            data.append({key: coerce(value) for key, value in row.items()})

    with open(json_file, 'w') as file:
        json.dump(data, file, indent=2)

# Usage
# NOTE: requires products.csv to exist in the working directory.
csv_to_json('products.csv', 'products.json')

python code snippet end

Error Handling and Edge Cases

python code snippet start

import csv

def robust_csv_reader(filename):
    """Read a CSV file defensively, returning a list of row dicts.

    Sniffs the dialect from a leading sample, falling back to the default
    dialect when sniffing fails. Rows missing a 'name' or 'email' value are
    skipped with a warning. Returns [] (instead of raising) when the file is
    missing or not valid UTF-8.

    Args:
        filename: path to the CSV file; the first row is the header.

    Returns:
        list[dict]: rows that have both 'name' and 'email' populated.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            # Sample the start of the file so the Sniffer can guess the
            # delimiter/quote style, then rewind for the real read.
            sample = file.read(1024)
            file.seek(0)

            try:
                dialect = csv.Sniffer().sniff(sample)
                reader = csv.DictReader(file, dialect=dialect)
            except csv.Error:
                # Fall back to default dialect
                reader = csv.DictReader(file)

            rows = []
            for row in reader:
                # BUG FIX: the old enumerate(reader, 1) counted data rows,
                # not file lines (the header is line 1, and quoted fields
                # can span lines). reader.line_num is always accurate.
                if not row.get('name') or not row.get('email'):
                    print(f"Warning: Missing data in line {reader.line_num}")
                    continue

                rows.append(row)

            return rows

    except FileNotFoundError:
        # BUG FIX: these messages previously printed a literal placeholder
        # instead of the actual filename.
        print(f"File {filename} not found")
        return []
    except UnicodeDecodeError:
        print(f"Encoding error in {filename}")
        return []

# Usage
# Safe to run even when the file is missing: prints a message and returns [].
data = robust_csv_reader('messy_data.csv')

python code snippet end

Performance Tips

python code snippet start

import csv

# For large files, process row by row (memory efficient)
def process_large_csv(filename):
    """Stream a large CSV file one row at a time, yielding processed rows.

    Being a generator keeps memory flat: only the current row is held,
    never the whole file. Relies on a process_row() helper defined elsewhere.
    """
    with open(filename, 'r') as src:
        for record in csv.DictReader(src):
            yield process_row(record)

# Batch processing for database inserts
def batch_insert_csv(filename, batch_size=1000):
    """Read a CSV and hand rows to insert_batch() in groups of batch_size.

    Relies on an insert_batch() helper (e.g. a database bulk insert)
    defined elsewhere.
    """
    pending = []

    with open(filename, 'r') as src:
        for record in csv.DictReader(src):
            pending.append(record)

            # Flush whenever a full batch has accumulated
            if len(pending) >= batch_size:
                insert_batch(pending)
                pending = []

        # Flush whatever is left over after the final partial batch
        if pending:
            insert_batch(pending)

python code snippet end

Best Practices

python code snippet start

# Always use newline='' when writing CSV files
# (prevents blank lines on Windows; the csv module manages line endings itself)
with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)

# Use DictReader/DictWriter for better readability
# Good
with open('data.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        print(row['name'])

# Less readable
with open('data.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row[0])  # Which column is this?

# Handle encoding explicitly
# (avoids surprises from platform-dependent default encodings)
with open('data.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

# Validate data types
def safe_int(value, default=0):
    """Coerce value to int; return default when conversion fails."""
    try:
        result = int(value)
    except (ValueError, TypeError):
        result = default
    return result

# NOTE: 'row' here stands in for a row produced by a DictReader loop above.
age = safe_int(row['age'])

python code snippet end

The csv module makes working with tabular data straightforward while handling the complexities of CSV format edge cases automatically.

CSV operations integrate seamlessly with pathlib for file handling and JSON conversion for comprehensive data processing workflows.

Reference: Python CSV Module Documentation