#!/usr/bin/perl -w
# Rough hack to convert MS Word files to something readable.
# This is just slightly better than "strings"...
# Copyright © 2000 Jamie Zawinski, all rights reserved.
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation.  No representations are made about the suitability of this
# software for any purpose.  It is provided "as is" without express or
# implied warranty.
#
# Created: 4-Jul-00.

require 5.006;    # three-argument open() is used below
use diagnostics;
use strict;
use Text::Wrap;

my $progname = $0; $progname =~ s@.*/@@g;
my $version = q{ $Revision: 1.1 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;


# _undoc_text ($raw)
#
# Pure text transformation used by undoc().  Takes the raw bytes of a
# document and returns the readable text: a sequence of word-wrapped
# paragraphs, each followed by a blank line.  Returns "" for empty or
# undefined input.
#
sub _undoc_text {
  my ($text) = @_;
  return "" unless defined $text;

  # convert line endings
  $text =~ s/\r\n?/\n/gs;

  # delete any line with a NUL in it
  $text =~ s/^.*[\000].*$//gm;

  # map some Windows characters to ASCII
  # (presumably Windows-1252 curly quotes, ellipsis and dashes)
  $text =~ s/\222/\'/g;
  $text =~ s/\205/ --/g;
  $text =~ s/\226/-/g;
  $text =~ s/\227/-- /g;
  $text =~ s/\223/``/g;
  $text =~ s/\224/''/g;

  # convert any non-Latin1 characters to paragraph breaks.
  # (control characters other than TAB and LF, plus bytes 0x80-0xA0.)
  $text =~ s/[\000-\010\013-\037\200-\240]+/\n\n/g;

  # strip trailing whitespace
  $text =~ s/[ \t]+$//gm;

  # squeeze multiple blank lines
  $text =~ s/\n\n\n+/\n\n/gs;

  my $out = "";
  foreach my $para (split (/\n\n/, $text)) {

    # Figure out what the indentation of the paragraph is...
    # (this pattern always matches, possibly with an empty capture.)
    my ($head) = ($para =~ m/^([ \t]*)/);

    # Also note numbered paragraphs: "\t 3. \t ..."
    my ($head2) = ($para =~ m/^([ \t]*([0-9]+|[a-zA-Z])[\).]*[ \t]+)/);
    if ($head2) {
      # Keep only the whitespace of the "3. " prefix, so continuation
      # lines are indented by the whitespace that surrounds the number.
      $head = $head2;
      $head =~ s/[0-9a-zA-Z\).]//g;
    }

    # Fix a mistake the above makes with paragraphs beginning with "A ...".
    $head =~ s/^ $//;

    # Wrap the lines in the paragraph: the first line keeps whatever
    # indentation it already has, later lines are indented by $head.
    $out .= wrap ("", $head, $para);
    $out .= "\n\n";
  }

  return $out;
}


# undoc ()
#
# Reads an entire document from STDIN, converts it with _undoc_text(),
# and prints the result on STDOUT.  Takes no arguments; the caller is
# expected to have pointed STDIN at the right input.
#
sub undoc {
  # Word files are binary: don't let the I/O layer translate line endings.
  binmode (STDIN);

  # Slurp all of STDIN in one read.
  my $doc = do { local $/; <STDIN> };

  print _undoc_text ($doc);
}


# main ()
#
# With no command-line arguments, converts STDIN.  Otherwise converts each
# named file in turn.  Dies with a usage message if any argument begins
# with "-", and dies with the system error if a file cannot be opened.
#
sub main {
  # Validate every argument before producing any output, so that a bad
  # option can't leave half-finished output behind.
  foreach my $arg (@ARGV) {
    die "usage: $progname [ files... ]\n" if ($arg =~ m/^-/);
  }

  if ($#ARGV < 0) {
    undoc ();
  } else {
    foreach my $file (@ARGV) {
      # Three-argument open: the file name is never parsed for a mode
      # or a pipe character.
      open (STDIN, '<', $file) || die "$progname: reading $file: $!\n";
      undoc ();
      close (STDIN);
    }
  }
}

main ();
exit 0;