#!/usr/bin/env perl
#
# unutf8	This script translates UTF8 encodings in text to something
#		useful.
#
#	usage:
#		unutf8 [-format | -width len | -help | -Version] [file]
#
# Revision History
#	1.0	Initial revision.				140908
#	1.1	Added the -format and -width options.		151208
#	1.2	Added licensing info.				180531
#
#	Written by Wayne Morrison, 140908.
#
#	Various pre-history versions of this script have been written
#	over the previous 15 years.  Some in Perl, some in Ruby.  Maybe
#	even some in other languages.
#
# Copyright 2014 Wayne Morrison
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#	http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

use strict;
use warnings;

use File::Temp ();
use Getopt::Long qw(:config no_ignore_case_always);

#
# Version information.
#
my $NAME = "unutf8";
my $VERS = "$NAME version: 1.2";

##############################################################################
#
# Options fields.
#

my %opts = ();				# Options.

#
# Command line arguments.
#
my @opts =
(
	'format',			# Format the line after translation.
	'width=i',			# Line width for formatting.
	'help',				# Give a help message.
	'Version',			# Display the program version.
);

my $format = 0;				# Format-output flag.
my $width  = 0;				# Line width.

my $DEFAULT_WIDTH = 72;			# Default width to use.

my $fmt = 'fmt';			# Format command to execute.

##############################################################################

#
# Only run when executed as a script.  When this file is loaded by other
# code (e.g., a test harness), caller() is true and nothing is executed.
#
unless(caller)
{
	main();
	exit();
}

#------------------------------------------------------------------------
# Routine:	main()
#
# Purpose:	Do everything:  parse the command line, read the input,
#		then translate (and maybe format) and print it.
#
sub main
{
	$| = 1;

	#
	# Munch on the options and arguments.
	#
	optsandargs();

	#
	# Get the input.
	#
	my $lines = getdata();

	#
	# Get rid of those nasty, nasty UTF8 characters.
	#
	cvtutf8($lines);
}

#----------------------------------------------------------------------
# Routine:	optsandargs()
#
# Purpose:	Parse the command line for options and arguments.
#		Sets the file-level $format and $width variables.
#		A parsing error or -help results in the usage message;
#		-Version results in the version message.  Both exit.
#
sub optsandargs
{
	#
	# Parse the options.
	#
	GetOptions(\%opts,@opts) || usage();

	#
	# Check for some immediate-action options.
	#
	usage()   if(defined($opts{'help'}));
	version() if(defined($opts{'Version'}));

	#
	# Check for the command-specific options.
	#
	$format = $opts{'format'};

	#
	# If the -width option was given, we'll get the value and turn
	# on the format flag.
	# If not, we'll set the default width and *not* touch the format
	# flag.
	#
	if(defined($opts{'width'}))
	{
		$width	= $opts{'width'};
		$format = 1;
	}
	else
	{
		$width = $DEFAULT_WIDTH;
	}
}

#------------------------------------------------------------------------
# Routine:	getdata()
#
# Purpose:	Get the input, either from a file named on the command line
#		or from stdin.
#
# Returns:	The whole input as a single string.
#
# Exits:	1 if the named file can't be opened.
#		2 if the input is empty.
#
sub getdata
{
	my @lines = ();				# Input lines array.
	my $lines = '';				# Input lines string.

	#
	# Get the input.  If a file was given on the command line, we'll
	# use it.  If not, we'll read from stdin.
	#
	if(@ARGV > 0)
	{
		my $infile = $ARGV[0];
		my $fh;

		#
		# Three-argument open so odd characters in the filename
		# can't be taken as part of the open mode.
		#
		if(! open($fh, '<', $infile))
		{
			print STDERR "unable to open input file \"$infile\"\n";
			exit(1);
		}

		@lines = <$fh>;
		close($fh);
	}
	else
	{
		@lines = <STDIN>;
	}

	#
	# Put the input into one long string.
	#
	$lines = join('', @lines);

	#
	# Ensure some input was given.
	#
	if($lines eq '')
	{
		print STDERR "no input to un-utf8\n";
		exit(2);
	}

	#
	# Return the input string.
	#
	return($lines);
}

#------------------------------------------------------------------------
# Routine:	cvtutf8()
#
# Purpose:	Perform all the conversions and print the result to
#		stdout, followed by a newline.
#
#		If the -format option was given, we'll format the
#		converted lines before printing.
#
# Argument:	The text to convert, as a single string.
#
sub cvtutf8
{
	my $lines = shift;			# Lines to convert.

	$lines = translate($lines);

	#
	# Format the translated lines so they're broken into shorter lines.
	#
	$lines = fmtlines($lines) if($format);

	print "$lines\n";
}

#------------------------------------------------------------------------
# Routine:	translate()
#
# Purpose:	Replace the known encoded sequences in a string with
#		their plain-text equivalents.
#
# Argument:	The text to translate.
#
# Returns:	The translated text.
#
sub translate
{
	my $lines = shift;			# Lines to translate.

	#
	# The translations, applied in order.  Each entry is a pattern
	# and its replacement text.
	#
	my @xlate =
	(
		#
		# =XX encodings.
		#
		# NOTE(review): =20 encodes a space, yet it is deleted
		# rather than replaced by a space.  Kept as it was --
		# confirm this is intended.
		#
		[ qr/=20/i,		''	],
		[ qr/=22/i,		'"'	],
		[ qr/=40/i,		'@'	],
		[ qr/=46/i,		'F'	],
		[ qr/=3F/i,		'?'	],
		[ qr/=5F/i,		'_'	],
		[ qr/=85/i,		'...'	],
		[ qr/=92/i,		"'"	],
		[ qr/=93/i,		'"'	],
		[ qr/=94/i,		'"'	],
		[ qr/=97/i,		'~'	],
		[ qr/=C2=A0/i,		' '	],
		[ qr/=E2=80=94/i,	'--'	],
		[ qr/=E2=80=9C/i,	'"'	],
		[ qr/=E2=80=9D/i,	'"'	],
		[ qr/=E2=80=98/i,	"'"	],
		[ qr/=E2=80=99/i,	"'"	],
		[ qr/=E2=80=A6/i,	'...'	],

		#
		# The encoded equals sign must be the last =XX rule.
		# If it ran earlier, the equals sign it produces could
		# pair with the following characters and be decoded a
		# second time (e.g., "=3D40" would end up as "@").
		#
		[ qr/=3D/i,		'='	],

		#
		# M-^X representations.
		#
		[ qr/M-\^E/i,		'...'	],
		[ qr/M-\^R/i,		"'"	],
		[ qr/M-\^S/i,		'"'	],
		[ qr/M-\^T/i,		'"'	],
		[ qr/M-\^W/i,		'~'	],

		#
		# Raw byte sequences.  The last entry gives the same
		# result as its =E2=80=A6 counterpart above.
		#
		[ qr/\xe2\x80\x94/,	'--'	],
		[ qr/\xe2\x80\x98/,	"'"	],
		[ qr/\xe2\x80\x99/,	"'"	],
		[ qr/\xe2\x80\x9c/,	'"'	],
		[ qr/\xe2\x80\x9d/,	'"'	],
		[ qr/\xe2\x80\xa6/,	'...'	],
	);

	#
	# Any lines ending with an equals sign will drop the equals
	# and be joined with the following line.
	#
	$lines =~ s/=\n//g;

	#
	# Do the transforms, given in a variety of representations.
	#
	foreach my $pair (@xlate)
	{
		my ($pattern, $replacement) = @$pair;

		$lines =~ s/$pattern/$replacement/g;
	}

	return($lines);
}

#------------------------------------------------------------------------
# Routine:	fmtlines()
#
# Purpose:	Run text through the format command ($fmt), giving it
#		the line width ($width).
#
#		The text is handed to the command in a temporary file
#		and the command is run without a shell, so quotes,
#		backquotes, and dollar signs in the text can't be
#		interpreted as shell syntax.
#
# Argument:	The text to format.
#
# Returns:	The formatted text.  If the format command can't be
#		run or reports failure, a message is written to stderr
#		and the unformatted text is returned.
#
sub fmtlines
{
	my $text = shift;			# Text to format.

	#
	# Save the text in a temporary file.  The file is removed when
	# $tmp goes out of scope.  The trailing newline matches what the
	# format command was given by earlier versions of this script.
	#
	my $tmp = File::Temp->new();
	print {$tmp} "$text\n";
	if(! close($tmp))
	{
		print STDERR "unable to save text for \"$fmt\":  $!\n";
		return($text);
	}

	#
	# Run the format command and collect its output.
	#
	my $pipe;
	if(! open($pipe, '-|', $fmt, $width, $tmp->filename))
	{
		print STDERR "unable to run \"$fmt\":  $!\n";
		return($text);
	}

	my $formatted = join('', <$pipe>);

	if(! close($pipe))
	{
		print STDERR "\"$fmt\" failed; text left unformatted\n";
		return($text);
	}

	return($formatted);
}

#----------------------------------------------------------------------
# Routine:	version()
#
# Purpose:	Print the version number(s) and exit.
#
sub version
{
	print STDERR "$VERS\n";
	exit(0);
}

#----------------------------------------------------------------------
# Routine:	usage()
#
# Purpose:	Give usage message and exit.
#
sub usage
{
	print STDERR "usage:  unutf8 [options]\n";
	print STDERR "\n";
	print STDERR "\twhere [options] are:\n";
	print STDERR "\t\t-format\n";
	print STDERR "\t\t-width linewidth\n";
	print STDERR "\t\t-help\n";
	print STDERR "\t\t-Version\n";
	exit(0);
}

1;

##############################################################################

=pod

=head1 NAME

B<unutf8> - Translate UTF8 encodings in text to legible text

=head1 SYNOPSIS

  unutf8 [options] [file]

=head1 DESCRIPTION

B<unutf8> is a filter that translates UTF8 encodings in text to legible text.
The data to be translated will be taken either from a file (if one was named
on the command line) or from standard input.  In some cases, the input might
have to be first passed through the B command.

This script does not convert all UTF8 characters.  It translates those
commonly encountered by the author.

Character translations are taken from B.

=for comment
NOTE(review): the name of the command in "passed through the B command" and
the source named in "taken from B" are missing from this text.  Restore them
from the author's original documentation.

=head1 OPTIONS

B<unutf8> takes the following options:

=over 4

=item I<-format>

Format the translated lines so they're broken into lines of a set length.
The actual effect is to pass the translated lines to the B<fmt> command.
If the I<-width> option is not given, then a default line width of 72
characters will be used.

=item I<-width linewidth>

Specify a rough line width to pass to the B<fmt> command.  This value will
be passed to the B<fmt> command.  This option implies the I<-format> option.

=item I<-Version>

Display the version information for B<unutf8>.

=item I<-help>

Display a help message.

=back

=head1 AUTHOR

Wayne Morrison, wayne@waynemorrison.com

=head1 LICENSE

Copyright 2014 Wayne Morrison

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=head1 SEE ALSO

B<fmt(1)>

=cut