Linux Text Processing Guide
Text processing is a core skill for Linux system administration and development. This guide covers powerful tools for searching, filtering, transforming, and analyzing text data.
Overview: Text Processing Tools
| Tool | Purpose | Best For |
|---|---|---|
| `grep` | Search patterns | Finding text in files |
| `sed` | Stream editor | Find and replace, text transformation |
| `awk` | Pattern scanning | Column-based processing, reports |
| `cut` | Extract columns | Extracting fields from delimited files |
| `sort` | Sort lines | Ordering text data |
| `uniq` | Remove duplicates | Finding unique/duplicate lines |
| `tr` | Translate characters | Case conversion, character replacement |
| `wc` | Count lines/words | Text statistics |
Searching Text with grep
Basic grep Usage
# Search for pattern in file
grep "error" logfile.txt
# Case-insensitive search
grep -i "warning" logfile.txt
# Search multiple files
grep "TODO" *.py
# Recursive search
grep -r "function" /project/src/
# Show line numbers
grep -n "import" script.py
Advanced grep Patterns
# Regular expressions
grep "^Error" logs.txt # Lines starting with "Error"
grep "failed$" logs.txt # Lines ending with "failed"
grep "[0-9]\{3\}" data.txt # Three consecutive digits (basic regex needs \{ \}; use -E for bare { })
grep "user\|admin" access.log # Match "user" OR "admin"
# Extended regex
grep -E "error|warning|critical" system.log
grep -E "[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}" access.log
# Perl-compatible regex
grep -oP "(?<=@)\w+" emails.txt # Print only the word that follows @ (-o shows just the match)
grep Output Control
# Show only matching part
grep -o "[0-9]*" file.txt
# Show context lines
grep -C 3 "error" log.txt # 3 lines before and after
grep -B 2 "error" log.txt # 2 lines before
grep -A 5 "error" log.txt # 5 lines after
# Count matches
grep -c "pattern" file.txt
# Show only filenames
grep -l "TODO" *.txt
# Invert match (show lines NOT matching)
grep -v "^#" config.conf
Practical grep Examples
# Find all email addresses
grep -E -o "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b" file.txt
# Find IP addresses
grep -E -o "([0-9]{1,3}\.){3}[0-9]{1,3}" access.log
# Find errors in logs
grep -i "error\|fail\|critical" /var/log/syslog
# Search excluding patterns
grep "error" logs.txt | grep -v "ignored"
# Multiple patterns from file
grep -f patterns.txt data.txt
Stream Editing with sed
Basic sed Operations
# Substitute first occurrence
sed 's/old/new/' file.txt
# Substitute all occurrences
sed 's/old/new/g' file.txt
# Substitute and save to file
sed -i 's/old/new/g' file.txt
# Create backup when editing
sed -i.bak 's/old/new/g' file.txt
# Substitute on specific line
sed '3s/old/new/' file.txt
Advanced sed Substitution
# Case-insensitive replace
sed 's/pattern/replacement/I' file.txt
# Replace only nth occurrence
sed 's/old/new/2' file.txt # Replace 2nd occurrence
# Replace in line range
sed '10,20s/old/new/g' file.txt
# Replace using regex groups
sed 's/\([0-9]*\)-\([0-9]*\)/\2-\1/' file.txt
# Multiple substitutions
sed 's/cat/dog/g; s/red/blue/g' file.txt
Delete and Insert Lines
# Delete specific line
sed '5d' file.txt
# Delete line range
sed '10,20d' file.txt
# Delete last line
sed '$d' file.txt
# Delete lines matching pattern
sed '/pattern/d' file.txt
# Delete empty lines
sed '/^$/d' file.txt
# Insert line before match
sed '/pattern/i\New line to insert' file.txt
# Append line after match
sed '/pattern/a\New line to append' file.txt
sed Practical Examples
# Remove comments and empty lines
sed '/^#/d; /^$/d' config.conf
# Add line numbers
sed = file.txt | sed 'N;s/\n/ /'
# Replace in specific column
sed 's/^\([^ ]* [^ ]*\) old /\1 new /' file.txt
# Convert DOS to Unix line endings
sed 's/\r$//' dosfile.txt > unixfile.txt
# Extract specific lines
sed -n '10,20p' file.txt # Print lines 10-20
Text Processing with awk
Basic awk Usage
# Print specific column
awk '{print $1}' file.txt # First column
awk '{print $2, $5}' file.txt # 2nd and 5th columns
awk '{print $NF}' file.txt # Last column
# Custom delimiter
awk -F ',' '{print $1}' data.csv
# Print with custom format
awk '{print "Name: " $1 ", Age: " $2}' data.txt
awk Patterns and Conditions
# Pattern matching
awk '/error/' logfile.txt # Lines containing "error"
awk '/^[0-9]/' data.txt # Lines starting with digit
# Conditional printing
awk '$3 > 100' data.txt # 3rd column > 100
awk '$1 == "admin"' users.txt # 1st column equals "admin"
awk 'length($0) > 80' file.txt # Lines longer than 80 chars
# Multiple conditions
awk '$3 > 50 && $3 < 100' data.txt
awk '$1 == "error" || $1 == "warning"' logs.txt
awk Calculations
# Sum column
awk '{sum += $3} END {print sum}' data.txt
# Average
awk '{sum += $1} END {print sum/NR}' numbers.txt
# Count lines meeting condition
awk '$3 > 100 {count++} END {print count}' data.txt
# Min and max
awk 'NR==1{max=$1; min=$1} $1>max{max=$1} $1<min{min=$1} END{print "Max:", max, "Min:", min}' data.txt
awk Practical Examples
# CSV processing
awk -F',' '{print $1, $3}' data.csv
# Generate reports
awk '{total+=$5} {count++} END {print "Total:", total, "Average:", total/count}' sales.txt
# Format output
awk '{printf "%-20s %10.2f\n", $1, $2}' data.txt
# Process logs
awk '$9 >= 400 {print $1, $7, $9}' access.log # HTTP errors
# Join fields
awk '{print $1 "-" $2 "-" $3}' data.txt
Extracting Data with cut
Basic cut Usage
# Extract by character position
cut -c1-5 file.txt # Characters 1-5
cut -c10- file.txt # From character 10 to end
# Extract by field (tab-delimited)
cut -f1 file.txt # First field
cut -f1,3 file.txt # Fields 1 and 3
# Custom delimiter
cut -d',' -f2 data.csv # 2nd field in CSV
cut -d':' -f1,6 /etc/passwd # Username and home dir
cut Practical Examples
# Extract usernames from /etc/passwd
cut -d':' -f1 /etc/passwd
# Get IP addresses from log
cut -d' ' -f1 access.log | sort | uniq
# Extract email domain
cut -d'@' -f2 emails.txt
# Get specific columns from CSV
cut -d',' -f2,4,6 data.csv
# Extract time from timestamps
cut -c12-19 log.txt
Sorting with sort
Basic Sorting
# Alphabetical sort
sort file.txt
# Reverse sort
sort -r file.txt
# Numeric sort
sort -n numbers.txt
# Sort by specific column
sort -k2 data.txt # Sort by 2nd column
# Unique sort
sort -u file.txt
Advanced Sorting
# Custom delimiter
sort -t',' -k2 data.csv
# Multiple sort keys
sort -k1,1 -k2,2n file.txt # Sort by col1, then by col2 numerically
# Human-readable numbers
sort -h sizes.txt # Sorts 1K, 2M, 3G correctly
# Month sort
sort -M months.txt # Jan, Feb, Mar...
# Random sort
sort -R file.txt
# Case-insensitive
sort -f file.txt
Finding Unique Lines with uniq
Basic uniq Usage
# Remove consecutive duplicates (requires sorted input)
sort file.txt | uniq
# Count occurrences
sort file.txt | uniq -c
# Show only duplicates
sort file.txt | uniq -d
# Show only unique lines
sort file.txt | uniq -u
# Ignore case
sort file.txt | uniq -i
uniq Practical Examples
# Count requests per IP address, most frequent first
cut -d' ' -f1 access.log | sort | uniq -c | sort -rn
# Find duplicate lines
sort data.txt | uniq -d
# Count how often each space-separated word occurs, most frequent first
tr ' ' '\n' < file.txt | sort | uniq -c | sort -rn
Character Translation with tr
Basic tr Usage
# Lowercase to uppercase
tr 'a-z' 'A-Z' < file.txt
# Delete characters
tr -d '0-9' < file.txt # Remove all digits
# Squeeze repeats
tr -s ' ' < file.txt # Squeeze multiple spaces to one
# Replace characters
tr ',' '\t' < data.csv # Replace commas with tabs
tr Practical Examples
# ROT13 encoding
tr 'A-Za-z' 'N-ZA-Mn-za-m' < secret.txt
# Remove all non-alphanumeric
tr -cd '[:alnum:]' < file.txt
# Convert Windows line endings
tr -d '\r' < dosfile.txt > unixfile.txt
# Remove all whitespace
tr -d '[:space:]' < file.txt
Counting with wc
# Count lines
wc -l file.txt
# Count words
wc -w file.txt
# Count bytes
wc -c file.txt
# Count characters (differs from bytes for multibyte encodings such as UTF-8)
wc -m file.txt
# All counts
wc file.txt # Lines, words, bytes
# Count files
ls | wc -l
# Count unique lines
sort file.txt | uniq | wc -l
Real-World Examples
Log Analysis
# Top 10 IP addresses
cut -d' ' -f1 access.log | sort | uniq -c | sort -rn | head -10
# Count HTTP status codes
awk '{print $9}' access.log | sort | uniq -c
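# Find errors logged during the clock hour one hour ago (assumes "YYYY-MM-DD HH" timestamps; adjust the date format to match your log)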
grep "$(date -d '1 hour ago' '+%Y-%m-%d %H')" /var/log/syslog | grep -i error
# Extract failed login attempts
grep "Failed password" /var/log/auth.log | awk '{print $11}' | sort | uniq -c
Data Processing
# Calculate average from column
awk '{sum+=$3; count++} END {print sum/count}' data.txt
# Find duplicates in CSV
cut -d',' -f1 data.csv | sort | uniq -d
# Merge multiple already-sorted files into one sorted stream (-m does not sort unsorted input)
sort -m file1.txt file2.txt file3.txt
# Remove blank lines and comments
sed '/^#/d; /^$/d' config.conf > config.clean
Text Transformation
# Convert CSV to tab-delimited
tr ',' '\t' < data.csv > data.tsv
# Add line numbers
awk '{print NR": "$0}' file.txt
# Reverse lines
tac file.txt
# Reverse characters in each line
rev file.txt
# Extract URLs from HTML
grep -o 'http[s]\?://[^"]*' page.html
Quick Reference
Common Patterns
# Search and replace in all files
find . -name "*.txt" -exec sed -i 's/old/new/g' {} \;
# Count occurrences of pattern
grep -c "pattern" file.txt
# Extract column and remove duplicates
cut -d',' -f2 data.csv | sort -u
# Sum numbers in file
awk '{s+=$1} END {print s}' numbers.txt
# Find most common lines
sort file.txt | uniq -c | sort -rn | head