Linux Text Processing Guide

Text processing is a core skill for Linux system administration and development. This guide covers powerful tools for searching, filtering, transforming, and analyzing text data.

Overview: Text Processing Tools

Tool	Purpose	Best For
`grep`	Search patterns	Finding text in files
`sed`	Stream editor	Find and replace, text transformation
`awk`	Pattern scanning	Column-based processing, reports
`cut`	Extract columns	Extracting fields from delimited files
`sort`	Sort lines	Ordering text data
`uniq`	Remove duplicates	Finding unique/duplicate lines
`tr`	Translate characters	Case conversion, character replacement
`wc`	Count lines/words	Text statistics

Searching Text with grep

Basic grep Usage

# Search for pattern in file
grep "error" logfile.txt

# Case-insensitive search
grep -i "warning" logfile.txt

# Search multiple files
grep "TODO" *.py

# Recursive search
grep -r "function" /project/src/

# Show line numbers
grep -n "import" script.py

Advanced grep Patterns

# Regular expressions
grep "^Error" logs.txt          # Lines starting with "Error"
grep "failed$" logs.txt          # Lines ending with "failed"
grep "[0-9]\{3\}" data.txt       # Three consecutive digits (basic regex needs \{ \})
grep "user\|admin" access.log    # Match "user" OR "admin"

# Extended regex
grep -E "error|warning|critical" system.log
grep -E "[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}" access.log

# Perl-compatible regex
grep -oP "(?<=@)\w+" emails.txt  # Print only the word following each @

grep Output Control

# Show only matching part
grep -o "[0-9]*" file.txt

# Show context lines
grep -C 3 "error" log.txt        # 3 lines before and after
grep -B 2 "error" log.txt        # 2 lines before
grep -A 5 "error" log.txt        # 5 lines after

# Count matching lines (a line with several matches counts once)
grep -c "pattern" file.txt

# Show only filenames
grep -l "TODO" *.txt

# Invert match (show lines NOT matching)
grep -v "^#" config.conf

Practical grep Examples

# Find all email addresses
grep -E -o "\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b" file.txt

# Find IP addresses
grep -E -o "([0-9]{1,3}\.){3}[0-9]{1,3}" access.log

# Find errors in logs
grep -i "error\|fail\|critical" /var/log/syslog

# Search excluding patterns
grep "error" logs.txt | grep -v "ignored"

# Multiple patterns from file
grep -f patterns.txt data.txt

Stream Editing with sed

Basic sed Operations

# Substitute first occurrence
sed 's/old/new/' file.txt

# Substitute all occurrences
sed 's/old/new/g' file.txt

# Substitute and save to file
sed -i 's/old/new/g' file.txt

# Create backup when editing
sed -i.bak 's/old/new/g' file.txt

# Substitute on specific line
sed '3s/old/new/' file.txt

Advanced sed Substitution

# Case-insensitive replace
sed 's/pattern/replacement/I' file.txt

# Replace only nth occurrence
sed 's/old/new/2' file.txt       # Replace 2nd occurrence

# Replace in line range
sed '10,20s/old/new/g' file.txt

# Replace using regex groups
sed 's/\([0-9]*\)-\([0-9]*\)/\2-\1/' file.txt

# Multiple substitutions
sed 's/cat/dog/g; s/red/blue/g' file.txt

Delete and Insert Lines

# Delete specific line
sed '5d' file.txt

# Delete line range
sed '10,20d' file.txt

# Delete last line
sed '$d' file.txt

# Delete lines matching pattern
sed '/pattern/d' file.txt

# Delete empty lines
sed '/^$/d' file.txt

# Insert line before match
sed '/pattern/i\New line to insert' file.txt

# Append line after match
sed '/pattern/a\New line to append' file.txt

sed Practical Examples

# Remove comments and empty lines
sed '/^#/d; /^$/d' config.conf

# Add line numbers
sed = file.txt | sed 'N;s/\n/  /'

# Replace in specific column
sed 's/^\([^ ]* [^ ]*\) old /\1 new /' file.txt

# Convert DOS to Unix line endings
sed 's/\r$//' dosfile.txt > unixfile.txt

# Extract specific lines
sed -n '10,20p' file.txt         # Print lines 10-20

Text Processing with awk

Basic awk Usage

# Print specific column
awk '{print $1}' file.txt        # First column
awk '{print $2, $5}' file.txt    # 2nd and 5th columns
awk '{print $NF}' file.txt       # Last column

# Custom delimiter
awk -F ',' '{print $1}' data.csv

# Print with custom format
awk '{print "Name: " $1 ", Age: " $2}' data.txt

awk Patterns and Conditions

# Pattern matching
awk '/error/' logfile.txt        # Lines containing "error"
awk '/^[0-9]/' data.txt          # Lines starting with digit

# Conditional printing
awk '$3 > 100' data.txt          # 3rd column > 100
awk '$1 == "admin"' users.txt    # 1st column equals "admin"
awk 'length($0) > 80' file.txt   # Lines longer than 80 chars

# Multiple conditions
awk '$3 > 50 && $3 < 100' data.txt
awk '$1 == "error" || $1 == "warning"' logs.txt

awk Calculations

# Sum column
awk '{sum += $3} END {print sum}' data.txt

# Average
awk '{sum += $1} END {print sum/NR}' numbers.txt

# Count lines meeting condition
awk '$3 > 100 {count++} END {print count+0}' data.txt   # +0 prints 0 when nothing matches

# Min and max
awk 'NR==1{max=$1; min=$1} $1>max{max=$1} $1<min{min=$1} END{print "Max:", max, "Min:", min}' data.txt

awk Practical Examples

# CSV processing
awk -F',' '{print $1, $3}' data.csv

# Generate reports
awk '{total+=$5} {count++} END {print "Total:", total, "Average:", total/count}' sales.txt

# Format output
awk '{printf "%-20s %10.2f\n", $1, $2}' data.txt

# Process logs
awk '$9 >= 400 {print $1, $7, $9}' access.log  # HTTP errors

# Join fields
awk '{print $1 "-" $2 "-" $3}' data.txt

Extracting Data with cut

Basic cut Usage

# Extract by character position
cut -c1-5 file.txt              # Characters 1-5
cut -c10- file.txt              # From character 10 to end

# Extract by field (tab-delimited)
cut -f1 file.txt                # First field
cut -f1,3 file.txt              # Fields 1 and 3

# Custom delimiter
cut -d',' -f2 data.csv          # 2nd field in CSV
cut -d':' -f1,6 /etc/passwd     # Username and home dir

cut Practical Examples

# Extract usernames from /etc/passwd
cut -d':' -f1 /etc/passwd

# Get IP addresses from log
cut -d' ' -f1 access.log | sort | uniq

# Extract email domain
cut -d'@' -f2 emails.txt

# Get specific columns from CSV
cut -d',' -f2,4,6 data.csv

# Extract time from timestamps
cut -c12-19 log.txt

Sorting with sort

Basic Sorting

# Alphabetical sort
sort file.txt

# Reverse sort
sort -r file.txt

# Numeric sort
sort -n numbers.txt

# Sort by specific column
sort -k2 data.txt               # Sort by 2nd column

# Unique sort
sort -u file.txt

Advanced Sorting

# Custom delimiter
sort -t',' -k2 data.csv

# Multiple sort keys
sort -k1,1 -k2,2n file.txt      # Sort by col1, then by col2 numerically

# Human-readable numbers
sort -h sizes.txt               # Sorts 1K, 2M, 3G correctly

# Month sort
sort -M months.txt              # Jan, Feb, Mar...

# Random sort
sort -R file.txt

# Case-insensitive
sort -f file.txt

Finding Unique Lines with uniq

Basic uniq Usage

# Remove consecutive duplicates (requires sorted input)
sort file.txt | uniq

# Count occurrences
sort file.txt | uniq -c

# Show only duplicates
sort file.txt | uniq -d

# Show only unique lines
sort file.txt | uniq -u

# Ignore case (sort -f keeps lines differing only in case adjacent)
sort -f file.txt | uniq -i

uniq Practical Examples

# Count requests per IP address (most frequent first)
cut -d' ' -f1 access.log | sort | uniq -c | sort -rn

# Find duplicate lines
sort data.txt | uniq -d

# Count word frequency (most common first)
tr ' ' '\n' < file.txt | sort | uniq -c | sort -rn

Character Translation with tr

Basic tr Usage

# Lowercase to uppercase
tr 'a-z' 'A-Z' < file.txt

# Delete characters
tr -d '0-9' < file.txt          # Remove all digits

# Squeeze repeats
tr -s ' ' < file.txt            # Squeeze multiple spaces to one

# Replace characters
tr ',' '\t' < data.csv          # Replace commas with tabs

tr Practical Examples

# ROT13 encoding
tr 'A-Za-z' 'N-ZA-Mn-za-m' < secret.txt

# Remove all non-alphanumeric
tr -cd '[:alnum:]' < file.txt

# Convert Windows line endings
tr -d '\r' < dosfile.txt > unixfile.txt

# Remove all whitespace
tr -d '[:space:]' < file.txt

Counting with wc

# Count lines
wc -l file.txt

# Count words
wc -w file.txt

# Count bytes
wc -c file.txt

# Count characters (multibyte-aware)
wc -m file.txt

# All counts
wc file.txt                     # Lines, words, bytes

# Count files
ls | wc -l

# Count unique lines
sort file.txt | uniq | wc -l

Real-World Examples

Log Analysis

# Top 10 IP addresses
cut -d' ' -f1 access.log | sort | uniq -c | sort -rn | head -10

# Count HTTP status codes
awk '{print $9}' access.log | sort | uniq -c

# Find errors logged during the previous clock hour (assumes traditional "Mon DD HH:MM:SS" syslog timestamps)
grep "$(date -d '1 hour ago' '+%b %e %H:')" /var/log/syslog | grep -i error

# Extract failed login attempts
grep "Failed password" /var/log/auth.log | awk '{print $11}' | sort | uniq -c

Data Processing

# Calculate average from column
awk '{sum+=$3; count++} END {print sum/count}' data.txt

# Find duplicates in CSV
cut -d',' -f1 data.csv | sort | uniq -d

# Merge multiple already-sorted files into one sorted stream (use plain sort if inputs are unsorted)
sort -m file1.txt file2.txt file3.txt

# Remove blank lines and comments
sed '/^#/d; /^$/d' config.conf > config.clean

Text Transformation

# Convert CSV to tab-delimited
tr ',' '\t' < data.csv > data.tsv

# Add line numbers
awk '{print NR": "$0}' file.txt

# Reverse lines
tac file.txt

# Reverse characters in each line
rev file.txt

# Extract URLs from HTML
grep -o 'http[s]\?://[^"]*' page.html

Quick Reference

Common Patterns

# Search and replace in all files
find . -name "*.txt" -exec sed -i 's/old/new/g' {} \;

# Count lines matching pattern
grep -c "pattern" file.txt

# Extract column and remove duplicates
cut -d',' -f2 data.csv | sort -u

# Sum numbers in file
awk '{s+=$1} END {print s}' numbers.txt

# Find most common lines
sort file.txt | uniq -c | sort -rn | head