CSV Module: Easy CSV File Reading and Writing
TL;DR
The csv module provides csv.reader(), csv.writer(), csv.DictReader(), and csv.DictWriter() for robust CSV file processing, with optional dialect detection via csv.Sniffer() and proper handling of quotes, delimiters, and line endings.
Interesting!
The csv module can automatically detect CSV dialects (delimiter, quote style, etc.) from sample data, and it properly handles tricky cases like embedded commas, quotes, and newlines within CSV fields that would break naive string splitting.
Basic CSV Reading
python code snippet start
import csv

# Reading with csv.reader(): each row comes back as a list of strings.
with open('data.csv', 'r') as file:
    reader = csv.reader(file)

    # Skip header row (next() advances the reader past line 1)
    headers = next(reader)
    print(f"Headers: {headers}")

    # Process each row
    for row in reader:
        print(row)  # Each row is a list
        print(f"Name: {row[0]}, Age: {row[1]}")

# Sample data.csv:
# Name,Age,City
# Alice,25,New York
# Bob,30,London
python code snippet end
DictReader for Named Access
python code snippet start
import csv

# Reading with DictReader (recommended): rows are dicts keyed by header names.
with open('employees.csv', 'r') as file:
    reader = csv.DictReader(file)

    for row in reader:
        # Each row is a dictionary
        print(f"Name: {row['name']}")
        print(f"Department: {row['department']}")
        print(f"Salary: ${row['salary']}")
        print("---")

# Alternative: specify fieldnames explicitly (useful for headerless files)
with open('data.csv', 'r') as file:
    reader = csv.DictReader(file, fieldnames=['name', 'age', 'city'])
    next(reader)  # Skip header if present; without this the header is read as data
    for row in reader:
        print(row['name'], row['age'])
python code snippet end
Writing CSV Files
python code snippet start
import csv
from typing import NamedTuple

# Writing with csv.writer()
data = [
    ['Name', 'Age', 'City'],
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'London'],
    ['Charlie', 35, 'Tokyo'],
]

with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(data[0])
    # Write data rows in one call
    writer.writerows(data[1:])


# Writing individual rows from record objects.
# (The original snippet referenced an undefined `people_data`; define
# sample records so the example actually runs.)
class Person(NamedTuple):
    name: str
    age: int
    city: str


people_data = [
    Person('Alice', 25, 'New York'),
    Person('Bob', 30, 'London'),
]

with open('people.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for person in people_data:
        writer.writerow([person.name, person.age, person.city])
python code snippet end
DictWriter for Dictionary Data
python code snippet start
import csv

# Writing with DictWriter (recommended): rows are dicts keyed by fieldnames.
employees = [
    {'name': 'Alice', 'department': 'Engineering', 'salary': 75000},
    {'name': 'Bob', 'department': 'Sales', 'salary': 65000},
    {'name': 'Charlie', 'department': 'Marketing', 'salary': 70000},
]

with open('employees.csv', 'w', newline='') as file:
    fieldnames = ['name', 'department', 'salary']
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # Write header (DictWriter does not do this automatically)
    writer.writeheader()

    # Write data
    for employee in employees:
        writer.writerow(employee)

    # Or write all at once:
    # writer.writerows(employees)
python code snippet end
Custom Dialects and Formatting
python code snippet start
import csv


# Custom dialect: a pipe-delimited variant of the standard Excel dialect.
class MyDialect(csv.excel):
    delimiter = '|'
    quotechar = '"'
    doublequote = True           # escape embedded quotes by doubling them
    skipinitialspace = True      # ignore whitespace right after the delimiter
    lineterminator = '\n'
    quoting = csv.QUOTE_MINIMAL  # quote only fields that need it


# Register custom dialect under a short name usable in reader()/writer()
csv.register_dialect('pipes', MyDialect)

# Use custom dialect for writing
with open('pipe_data.csv', 'w', newline='') as file:
    writer = csv.writer(file, dialect='pipes')
    writer.writerow(['Name', 'Age', 'City'])
    writer.writerow(['Alice', 25, 'New York'])

# Reading with the same custom dialect
with open('pipe_data.csv', 'r') as file:
    reader = csv.reader(file, dialect='pipes')
    for row in reader:
        print(row)
python code snippet end
Handling Different Formats
python code snippet start
import csv

# Tab-separated values: same API, just a different delimiter
with open('data.tsv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerow(['Name', 'Score', 'Grade'])
    writer.writerow(['Alice', 95, 'A'])

# Custom quote handling
with open('quotes.csv', 'w', newline='') as file:
    writer = csv.writer(file, quoting=csv.QUOTE_ALL)  # Quote everything
    writer.writerow(['Name', 'Comment'])
    writer.writerow(['Bob', 'Said "Hello, World!"'])

# Different quoting options:
# csv.QUOTE_MINIMAL    - Only when necessary (default)
# csv.QUOTE_ALL        - Quote all fields
# csv.QUOTE_NONNUMERIC - Quote non-numeric fields
# csv.QUOTE_NONE       - Never quote (escape instead)
python code snippet end
Practical Data Processing
Sales Data Analysis
python code snippet start
import csv
from collections import defaultdict


def analyze_sales_data(filename):
    """Aggregate sales figures from a CSV file.

    Args:
        filename: Path to a CSV file with columns region,product,quantity,price.

    Returns:
        Tuple of two plain dicts: (revenue per region, total quantity per
        product), where revenue is quantity * price summed per region.

    Raises:
        KeyError: if a required column is missing.
        ValueError: if quantity/price are not numeric.
    """
    sales_by_region = defaultdict(float)
    product_totals = defaultdict(int)

    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            region = row['region']
            product = row['product']
            quantity = int(row['quantity'])
            price = float(row['price'])

            sales_by_region[region] += quantity * price
            product_totals[product] += quantity

    # Convert defaultdicts to plain dicts so callers don't get
    # silent-zero behavior on missing keys.
    return dict(sales_by_region), dict(product_totals)


# Usage (guarded so importing this module doesn't require sales.csv to exist)
# sales.csv format: region,product,quantity,price
if __name__ == '__main__':
    sales_by_region, product_totals = analyze_sales_data('sales.csv')
    print("Sales by region:", sales_by_region)
    print("Product quantities:", product_totals)
python code snippet end
Data Cleaning and Transformation
python code snippet start
import csv


def clean_customer_data(input_file, output_file):
    """Clean and standardize customer data.

    Reads rows with columns id,full_name,email_address,phone_number,city
    from ``input_file`` and writes normalized rows (customer_id, name,
    email, phone, city) to ``output_file``. Rows failing validation
    (no '@' in email, or fewer than 10 phone digits) are skipped with a
    message rather than written.
    """
    with open(input_file, 'r') as infile, \
            open(output_file, 'w', newline='') as outfile:
        reader = csv.DictReader(infile)

        # Define output fields (renamed/normalized versions of the input)
        fieldnames = ['customer_id', 'name', 'email', 'phone', 'city']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Clean and transform data
            cleaned_row = {
                'customer_id': row['id'].strip(),
                'name': row['full_name'].title().strip(),
                'email': row['email_address'].lower().strip(),
                # Keep digits only: "(555) 123-4567" -> "5551234567"
                'phone': ''.join(filter(str.isdigit, row['phone_number'])),
                'city': row['city'].title().strip(),
            }

            # Validation: plausible email and at least 10 phone digits
            if '@' in cleaned_row['email'] and len(cleaned_row['phone']) >= 10:
                writer.writerow(cleaned_row)
            else:
                print(f"Skipped invalid row: {row['id']}")


# Usage (guarded so importing doesn't require raw_customers.csv to exist)
if __name__ == '__main__':
    clean_customer_data('raw_customers.csv', 'clean_customers.csv')
python code snippet end
CSV to JSON Conversion
python code snippet start
import csv
import json
def csv_to_json(csv_file, json_file):
"""Convert CSV file to JSON"""
data = []
with open(csv_file, 'r') as file:
reader = csv.DictReader(file)
for row in reader:
# Convert numeric strings to numbers
for key, value in row.items():
if value.isdigit():
row[key] = int(value)
elif value.replace('.', '').isdigit():
row[key] = float(value)
data.append(row)
with open(json_file, 'w') as file:
json.dump(data, file, indent=2)
# Usage
csv_to_json('products.csv', 'products.json')
python code snippet end
Error Handling and Edge Cases
python code snippet start
import csv


def robust_csv_reader(filename):
    """Read a CSV file defensively, returning a list of row dicts.

    Sniffs the dialect from a 1 KiB sample (falling back to the default
    Excel dialect if sniffing fails), skips rows missing the required
    'name'/'email' fields with a warning, and returns [] when the file
    is missing or not valid UTF-8.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            # Detect dialect automatically from the first KiB
            sample = file.read(1024)
            file.seek(0)
            try:
                dialect = csv.Sniffer().sniff(sample)
                reader = csv.DictReader(file, dialect=dialect)
            except csv.Error:
                # Fall back to default dialect
                reader = csv.DictReader(file)

            rows = []
            # start=2: the header row occupies line 1 of the file, so the
            # first data row is physical line 2 (the original started at 1,
            # which mislabeled every warning).
            for line_num, row in enumerate(reader, 2):
                # Validate required fields
                if not row.get('name') or not row.get('email'):
                    print(f"Warning: Missing data in line {line_num}")
                    continue
                rows.append(row)
            return rows
    except FileNotFoundError:
        # Original printed the literal "(unknown)" instead of the filename
        print(f"File {filename} not found")
        return []
    except UnicodeDecodeError:
        print(f"Encoding error in {filename}")
        return []


# Usage (guarded so importing doesn't require messy_data.csv to exist)
if __name__ == '__main__':
    data = robust_csv_reader('messy_data.csv')
python code snippet end
Performance Tips
python code snippet start
import csv


# For large files, process row by row (memory efficient)
def process_large_csv(filename):
    """Yield process_row(row) for each row of `filename`, streaming.

    Rows are consumed lazily, so the whole file is never held in memory.
    NOTE(review): relies on a `process_row` callable defined elsewhere —
    this snippet is a template, not runnable standalone.
    """
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # Process one row at a time; don't load the entire file
            yield process_row(row)


# Batch processing for database inserts
def batch_insert_csv(filename, batch_size=1000):
    """Insert CSV data in batches of at most `batch_size` rows.

    NOTE(review): relies on an `insert_batch` callable defined elsewhere —
    this snippet is a template, not runnable standalone.
    """
    batch = []
    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            batch.append(row)
            if len(batch) >= batch_size:
                # Flush a full batch to the database
                insert_batch(batch)
                batch = []

    # Insert remaining rows (the last, possibly short, batch)
    if batch:
        insert_batch(batch)
python code snippet end
Best Practices
python code snippet start
# Always use newline='' when writing CSV files
with open('data.csv', 'w', newline='') as file:
writer = csv.writer(file)
# Use DictReader/DictWriter for better readability
# Good
with open('data.csv', 'r') as file:
reader = csv.DictReader(file)
for row in reader:
print(row['name'])
# Less readable
with open('data.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
print(row[0]) # Which column is this?
# Handle encoding explicitly
with open('data.csv', 'r', encoding='utf-8') as file:
reader = csv.DictReader(file)
# Validate data types
def safe_int(value, default=0):
try:
return int(value)
except (ValueError, TypeError):
return default
age = safe_int(row['age'])
python code snippet end
The csv module makes working with tabular data straightforward while handling the complexities of CSV format edge cases automatically.
CSV operations integrate seamlessly with pathlib for file handling and JSON conversion for comprehensive data processing workflows.
Reference: Python CSV Module Documentation