#!/usr/bin/perl -w # Copyright © 2015-2021 Jamie Zawinski # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation. No representations are made about the suitability of this # software for any purpose. It is provided "as is" without express or # implied warranty. # # There are several PHP Lint tools out there, none of which seem to do what # I want, and all of which are hard to use. # # I wrote this to help me refucktor a gigantic pile of very old, very shitty # PHP code. My goals with that target codebase were: # # - No code at top level except constants. All code inside functions. # - Declare all globals. # - Detect use of undeclared globals. # - Globals are upper case, just because. # - No implicit variables: don't use extract(). # # Usage: # # - Just run it on PHP files. It prints warnings. # # - If it screws up, make sure that the input code has } in column 0 # if and only if it is terminating a PHP function. What are you, # some kind of savage? # # - Do you think that inlining HTML between "?>" and " echo(), print(), printf(), __*PRINT(). # - SQL: mysql*string() -> mysql*query*(). # # But with a static analyzer, taint-checking can only assume that # all statements are executed in lexicographic order. That's not # useless, but being actually *correct* is the halting problem. # # Created: 16-May-2015. require 5; use diagnostics; use strict; use open ":encoding(utf8)"; my $progname = $0; $progname =~ s@.*/@@g; my ($version) = ('$Revision: 1.11 $' =~ m/\s(\d[.\d]+)\s/s); my $verbose = 0; my $debug_p = 0; # Assume every PHP file can access these globals without declaring them. # my @default_globals = ( '$argv', '$_SERVER', '$_GET', '$_POST', '$_SESSION', '$_COOKIE' ); # Assume these are globals only for top-level code (not inside functions). # E.g., an included file sets them. # my @toplevel_globals = (); sub lint_chunk($$$$$) { my ($file, $lineno, $name, $body, $file_scope) = @_; # Advance $lineno past leading whitespace. if ($body =~ s/^(\s+)//gs) { my $white = $1; $white =~ s/[^\n]//gs; $lineno += length($white); } my $olineno = $lineno; print STDERR "\n" if ($verbose > 1); print STDERR "$file: $lineno: $name...\n" if ($verbose); my %lvalues; # 1 = global or arg; 2 = explicitly assigned. my %rvalues; my %vlineno; # last line we saw it on. my %warned; my %globals; my %fnglobals; print STDERR "$file: $lineno: top-level code exists\n" if ($name eq ' TOPLEVEL'); my $toplevel_p = ($name =~ s/^ //s); print STDERR ('#' x 50) . " $lineno: $name\n$body\n" if ($debug_p); print STDERR "$file: $lineno: $name: extract() called!\n" if ($body =~ m@\b extract \s* \(@sx); foreach my $g (@default_globals) { print STDERR "$file: $lineno: $name: def global lvalue $g\n" if ($verbose > 2); $lvalues{$g} = 1; $globals{$g} = 1; $vlineno{$g} = $lineno; } if ($toplevel_p) { foreach my $g (@toplevel_globals) { print STDERR "$file: $lineno: $name: def toplevel lvalue $g\n" if ($verbose > 2); $lvalues{$g} = 1; $globals{$g} = 1; $vlineno{$g} = $lineno; } } if ($toplevel_p) { # toplevel block # Variables referenced in previous top level blocks are still # considered to be initialized. # foreach my $v (keys (%$file_scope)) { $lvalues{$v} = $file_scope->{$v}; $vlineno{$v} = $lineno; } } else { # function # #### This fails on "function F ($a = array(1,2))" my ($head, $args, $fnbody) = ($body =~ m/^ ( \s* function \s+ # head [a-zA-Z\d_]+ \s* # name \( \s* ( [^[()]* ) \s* \) \s* # args ) ( \{ \s* .* \s* \} ) # fnbody /sx); error ("$file: $lineno: unparsable function: $name\n") unless $fnbody; # Arguments have already been lvalues. foreach my $a (split (/\s*,\s*/, $args)) { $a =~ s/\(.*?\)//gs; # "foo(bar)" $a =~ s/\s*=.*$//gs; # "$a = 0" $a =~ s/^&//; # "&$a" $lvalues{$a} = 1; $vlineno{$a} = $lineno; print STDERR "$file: $lineno: $name: arg lvalue $a\n" if ($verbose > 1); } $head =~ s/[^\n]//gs; # preserve newlines $body = "$head$fnbody"; } # Find all "global $VAR;" statements. # $body =~ s@ ( \b global \s+ ( .*? ) \s* ; ) @{ my ($sec, $globals) = ($1, $2); foreach my $v (split (/\s*[,;]\s*/s, $globals)) { if ($toplevel_p) { print STDERR "$file: $lineno: $name: no-op: toplevel global $v\n"; } else { print STDERR "$file: $lineno: $name: lower case global $v\n" if ($v =~ m/[a-z]/s); $fnglobals{$v} = 1; $globals{$v} = 1; print STDERR "$file: $lineno: $name: fn global $v\n" if ($verbose > 1); } } $sec =~ s/[^\n]//gs; # preserve newlines $sec; }@gsex; # To simplify lvalue/rvalue parsing, let's just munge for loops. # "foreach ($array as $elt)" ==> "$elt = $array"; # "foreach (F(X) as $elt)" ==> "$elt = F(X);" # $body =~ s@ ( \b foreach \s* \( \s* ( \$? [a-zA-Z\d_]+ \s* # f or $array (?: \s* \( .*? \) )? # ( X ) ) \s+ as \s+ ( &? \$ [a-zA-Z\d_]+ ) # &$elt \s* \) ) @{ my ($sec, $array, $elt) = ($1, $2, $3); $sec =~ s/[^\n]//gs; # preserve newlines "$elt = $array$sec"; }@gsex; # "foreach ($array as $k => $v)" ==> "$k = 1; $v = $array"; # $body =~ s@ ( \b foreach \s* \( \s* ( \$ [a-zA-Z\d_]+ ) # $array \s+ as \s+ ( &? \$ [a-zA-Z\d_]+ ) # &$k \s* => \s* ( &? \$ [a-zA-Z\d_]+ ) # &$v \s* \) ) @{ my ($sec, $array, $k, $v) = ($1, $2, $3, $4); $sec =~ s/[^\n]//gs; # preserve newlines "$k = 1; $v = $array$sec"; }@gsex; # Similarly, munge simple arrays: # "$a[$b]" ==> "$b; $a" ($b is an rvalue, $a might be either) # # #### This fails on $a[$b[$c]], etc. # $body =~ s@ ( ( &? \$ [a-zA-Z\d_]+ ) \s* \[ \s* ( &? \$ [a-zA-Z\d_]+ ) \s* \] ) @{ my ($sec, $a, $b) = ($1, $2, $3); $sec =~ s/[^\n]//gs; # preserve newlines "$b; $a$sec" }@gsex; # Munge list "($a, $b) =" to "$a = 1; $b =" # $body =~ s@ ( \b list \s* \( ( .*? ) \) )@{ my ($sec, $vars) = ($1, $2); my \@vars = split(/\s*,\s*/, $vars); my $last = pop(\@vars); $vars = join (" = 1; ", \@vars, "") . $last; $sec =~ s/[^\n]//gs; # preserve newlines "$vars$sec"; }@gsex; # Munge anonymous functions. # "function($a, $b, $c) { ... }" => "$a = 0; $b = 0; $c = 0; { ... }" # $body =~ s@ ( \b function \s* \( ( .*? ) \) )@{ my ($sec, $args) = ($1, $2); my \@args = split(/\s*,\s*/, $args); $args = join (" = 1; ", \@args, ""); $sec =~ s/[^\n]//gs; # preserve newlines "$args$sec"; }@gsex; # Make the third argument of preg_match() be an lvalue. # "preg_match($a, $b, $c)" => "preg_match($a, $b); $c = 1;" # $body =~ s@( ( \b preg_match \s* ) \( ( .*? ) \) )@{ my ($sec, $fn, $args) = ($1, $2, $3); my \@args = split(/\s*,\s*/, $args); my $match = $args[2]; if (defined($match)) { pop \@args; $args = join(', ', \@args); $args = "$fn($args); $match = 1; "; } else { $args = "$fn($args)"; } $sec =~ s/[^\n]//gs; # preserve newlines "$args$sec"; }@gsex; if ($toplevel_p && $name eq 'HEAD') { # HEAD should contain only includes and simple assignment of globals: # # require_once("x"); # $XX = 0; # $XX = array(); # $XX = 60 * 60 * 24; # $body =~ s/^ (\s*) <\? (php \b )? /$1/sx; my $ok = 1; foreach my $expr (split (/\s*;\s*/, $body)) { $expr =~ s/^\s+|\s+$//s; if ($expr =~ m/^(require|include)(_once)?\s/s) { } elsif ($expr =~ m/^ \$ [A-Z_]+ \s* # $X = \s* # = (?: (?: (?: ([\'\"]) .*? \1 ) | # "" \$[A-Z_]+ | # $Y \d+ (?: \. \d+ )? | # 0.0 array \s* \( [^()] + \) # array(Y) ) (?: \s* [-+.*\/] )? # . \s* )+ \s* $/sx) { } else { $ok = 0; } } print STDERR "$file: $lineno: complex expression in $name\n" unless ($ok); } elsif ($toplevel_p && $name eq 'TAIL') { # TAIL should contain nothing but a single call to main() or the like. # $body =~ s/^ ( \s* ) ( <\? )? /$1/sx; $body =~ s/ \s* ( \?> \s* )? $ //sx; if ($body =~ m/^\s*$/s) { } elsif ($body =~ m/^ [_A-Za-z\d]+ \s* \( \s* \) \s* \;? \s* $/sx) { } else { print STDERR "$file: $lineno: complex expression in $name\n"; } } # Find all variables referenced. $body =~ s/(&?\$)/\001$1/gs; foreach my $sec (split (/\001/, $body)) { if ($sec =~ m@ ^ ( &? \$ \{? [a-zA-Z\d_]+ \}? ) \s* # $foo ( \[ [^\]]+ \] )? \s* # ['x'] ( [-+*/%.&|^<>=]* )? @sx) { # = == . .= += etc my ($var, $sub, $op) = ($1, $2, $3); $var =~ s/^&//; # "&$a" $var =~ s/[{}]//gs; $op = '' unless $op; if ($op =~ m~ ( ^[-+*/%.|^]? | ^<< | ^>> ) ^=$ | ^=- ~sx) { $lvalues{$var} = 2; $vlineno{$var} = $lineno; $file_scope->{$var} = 1 if ($toplevel_p); print STDERR "$file: $lineno: $name: lvalue $var ($op)\n" if ($verbose > 1); } elsif ($var eq '$__SHORT_OPEN_TAG') { print STDERR "$file: $lineno: $name: short open tag\n"; } else { if (!$lvalues{$var} && !$globals{$var} && !$warned{$var}) { print STDERR "$file: $lineno: $name: $var uninitialized" . ($op ? " ($op)" : "") . "\n"; $warned{$var} = 1; } $rvalues{$var} = 1; $vlineno{$var} = $lineno; print STDERR "$file: $lineno: $name: rvalue $var\n" if ($verbose > 1); } } $sec =~ s/[^\n]//gs; $lineno += length($sec); } foreach my $v (sort keys %fnglobals) { my $ln = $vlineno{$v} || $lineno; print STDERR "$file: $ln: $name: $v declared global but not referenced\n" unless ($rvalues{$v} || $lvalues{$v}); } if (!$toplevel_p) { foreach my $v (sort keys %lvalues) { my $ln = $vlineno{$v} || $lineno; print STDERR "$file: $ln: $name: $v set but not referenced\n" if (!$globals{$v} && $lvalues{$v} == 2 && !$rvalues{$v}); } # Check for variables that are not global in this function but # that were in some earlier function. # my %vv; foreach my $v (keys %lvalues, keys %rvalues) { $vv{$v} = 1; } foreach my $v (sort keys %vv) { next if $fnglobals{$v}; my $o = $file_scope->{"was_global $v"}; next unless $o; my $ln = $vlineno{$v} || $lineno; print STDERR "$file: $ln: $name: $v not global but was in $o\n"; } foreach my $v (keys %fnglobals) { $file_scope->{"was_global $v"} = $name; } } } sub lint($$) { my ($file, $purify_p) = @_; if ($file =~ m@\.(gif|p?jpe?g|png|ai|psd|e?ps|pdf|ttf)$@si) { print STDERR "$file: not a PHP file\n"; return; } open (my $in, '<', $file) || error ("$file: $!"); print STDERR "$progname: reading $file\n" if ($verbose); local $/ = undef; # read entire file my $body = <$in>; close $in; if (!$body || $body =~ m/^\s*$/s) { print STDERR "$file: empty\n"; return; } elsif ($body !~ m/<\?/s) { print STDERR "$file: not a PHP file\n"; return; } # The final "?>" may be omitted. $body .= '?>' unless ($body =~ m/ \?> \s* $ /sx); # First pass: have highest precedence. Divide PHP code # from string literals. # my @secs = split(/ ( <\? .*? \?> ) /sx, $body); foreach my $sec (@secs) { if ($sec eq '') { # blank } elsif (!$purify_p && $sec =~ m/^\s+$/s) { } elsif ($sec =~ m/^ ( <\?php | <\?= | <\? ) ( .* ) \?> $/sx) { # my $tag = $1; $sec = $2; my $printp = ($tag eq ' HTML my $comm = ''; if ($sec =~ s@ ( [ \t]* (// | \# ) [^\n]* ) $ @@sx) { $comm = $1; # This messes up $lineno, but without it we end up with: # code(); // comment ; print('HTML $comm .= "\n"; } if ($printp) { $sec =~ s/^[ \t]+|[ \t]+$//gs; # Wrap in parens for or $sec = "($sec)" if ($sec =~ m/(;.*?[\$a-z\d]|[,?])/si); $sec = "__IMPLICIT_PRINT($sec);"; } else { # Add terminating semicolon if it's missing. $sec .= ';' unless ($sec =~ m/[\};]\s*$/s); } $sec = '$__SHORT_OPEN_TAG;' . $sec # Fake variable, to warn if ($tag eq ' remain, they are unbalanced. # if ($body =~ m/^ ( .*? ) ( <\? =? | \?> )/sx) { my $nl = $1; my $x = $2; $nl =~ s/[^\n]//gs; $nl = length($nl) + 1; print STDERR "$file: $nl: unbalanced $x\n"; } # Temporarily quote \" \' \$ \\ to make string parsing easier. # $body =~ s@ \\ ( [\"\'\$\\] ) @{ if ($1 eq '"') { "\001"; } elsif ($1 eq "'") { "\002"; } elsif ($1 eq '$') { "\003"; } elsif ($1 eq '\\') { "\004"; } else { $1; } }@gsex; # We blank out the contents of strings so that lint_chunk() can search # for variable usage without having to worry about "\$x" or '$x'. # my $wipe_string_bodies_p = (!$purify_p && $debug_p < 2); # Strings and comments have equal precedence, so we have to process # them in a single pass. # $body =~ s@( ( \" | \' ) .*? \2 | # "..." or '...' ( \# | // ) [^\n]* | # // ... or # ... ( /\* ) .*? \*/ # /* ... */ )@{ my $a = $1; if ($a =~ m%^\"(.*)\"$%s) { # "strings" get variables promoted # "a $b c" -> "a" . $b . "c" $a = $1; my \@s = (); foreach my $s (split(/( (?: \$ \{? [a-zA-Z\d_]+ \}? ) # $foo (?: \s* \[ [^\]]+ \] )? # ['x'] ) /six, $a)) { if ($s eq '') { } elsif ($s =~ m/^\$/s) { # Guess what, "$0" through "$9" are string literals in PHP. Really. # But "${0}" and "$10" are not. $s = '""' if ($s =~ m/^\$\d$/s && !$purify_p); push \@s, $s; } else { if ($wipe_string_bodies_p) { # remaining "string" bodies emptied $s =~ s/[^\n]//gs; $s = "\"$s\""; } push \@s, $s; } } if ($purify_p) { $a = "\"$a\""; } else { $a = join(' . ', \@s); $a = '""' unless $a; } } elsif ($a =~ m%^\'%s) { # 'string' bodies are emptied if ($wipe_string_bodies_p) { $a =~ s/[^\n]//gs; $a = "'$a'"; } } else { # line and block comments are emptied $a =~ s/[^\n]//gs unless ($purify_p); } $a; }@gsex; if ($purify_p) { $body = "[1] =~ m/^ /s && $old->[1] =~ m/^ /s) { $old->[2] .= $sec->[2]; } elsif ($old && $old->[2] =~ m/^\s*$/s) { # previous is blank: merge. $old->[1] = $sec->[1]; # use later section name. $old->[2] .= $sec->[2]; } else { push @secs2, $sec; } } @secs = @secs2; } $secs[$#secs]->[1] = ' TAIL' if ($secs[$#secs]->[1] =~ m/^ /s); $secs[0 ]->[1] = ' HEAD' if ($secs[0 ]->[1] =~ m/^ /s); my %file_scope; # Storage for initialized file-local variables. # Lint each section. foreach my $sec (@secs) { lint_chunk ($file, $sec->[0], $sec->[1], $sec->[2], \%file_scope); } } sub error($) { my ($err) = @_; print STDERR "$progname: $err\n"; exit 1; } sub usage() { print STDERR "usage: $progname [--verbose] [--debug] [--purify] files ...\n"; exit 1; } sub main() { my $purify_p = 0; my @files = (); while ($#ARGV >= 0) { $_ = shift @ARGV; if (m/^--?verbose$/) { $verbose++; } elsif (m/^-v+$/) { $verbose += length($_)-1; } elsif (m/^--?debug$/) { $debug_p++; } elsif (m/^--?purify$/) { $purify_p++; } elsif (m/^-./) { usage; } else { push @files, $_; } } usage unless ($#files >= 0); error ("only one file allowed with --purify") if ($purify_p && @files > 1); foreach (@files) { lint ($_, $purify_p); } } main(); exit 0;