Perl Style Summery... Following the Perl style guide, * all capatial identifiers are reserved for special meaning with Perl. * Functions and local variables are all lowercase. * Module's persistent variables are capitalized. * Identifiers with multiple words are separated by an underscore. * don't use mixed capitals without underscores. (you wouldn't like reading this book without spaces, either) ------------------------------------------------------------------------------- Versions perl -V # all %Config defines perl -V:archname # the architucture to use for installation perl '-V:install.*' # the installation directories perl -MTk -e 'print $Tk::VERSION,"\n"' perl -MCGI -le 'print $CGI::VERSION' NOTE: -l turns on end of line auto handler (chomp input, add \n to output) pmdesc -v -w -s # print all perl modules and version numbers # perl cookbook recipe 12.19 example 12-3 Debugger perl -d:ptkdb myscript.pl ------------------------------------------------------------------------------- Argument handling #----------------------- #Shell Script Usage method #!/usr/bin/perl # # affine_rotate angle [center_x,y] [new_center_x,y] # # Generate a affine matrix, that rotates an image by a specific angle. # #### # # Anthony Thyssen (September 2005) # use strict; use FindBin; my $PROGNAME = $FindBin::Script; # Shell Script Usage Method sub Usage { print STDERR @_, "\n"; @ARGV = ( "$FindBin::Bin/$PROGNAME" ); while( <> ) { next if 1 .. 2; last if /^###/; s/^#$//; s/^# //; print STDERR "Usage: " if 3 .. 3; print STDERR; } exit 10; } #----------------------- # Perl Pod Usage Method #!/usr/bin/perl =head1 NAME program - A program with pod documention =head1 SYNOPSIS program [options] file... Options: -f Switch or Flag -a arg Option with agument -d|--debug Turn on debugging --help Quick Help (synopsis) --manual Program Manual =head1 DESCRIPTION Program documention in the body of the document. =head1 AUTHOR Anthony Thyssen 3 December 2002 A.Thyssen_AT_griffith.edu.au =cut use strict; use FindBin; my $PROGNAME = $FindBin::Script; # Perl Pod Usage Method sub Usage { eval { use Pod::Usage; pod2usage ( @_ ); }; } #------------------ #Argument handling # # This method allows you to have command line options in the forms... # GNU long options: --help --debug # Multi-switch options: -dvr # Option arguments either: -nNAME OR -n NAME # Optional option argument: -i.bak OR -i # tar-like option args: -fbs filename blocks skip # Sort like numerical ranges: -k20,30 # # Developed from the Perl Camel Book v3, page 122 # Programmers Note: # Within the inner option block... # "next" is equivelent to a "next OPTION" # "last" is equivelent to a "last OPTION" # "redo" means look for more multi-switch options (inner block) # OPTION: # Multi-switch option handling while( @ARGV && $ARGV[0] =~ s/^-(?=.)// ) { $_ = shift; { m/^$/ && do { next }; # Next option m/^-$/ && do { last }; # End of options '--' m/^\?/ && do { Usage }; # Usage Help '-?' m/^-help$/ && Usage( -verbose => 1); # quick help (synopsis) m/^-manual$/ && Usage( -verbose => 2); # inline manual m/^-debug$/ && do { $debug++; next }; # whole --word switches s/^d// && do { $debug++; redo }; # \ s/^f// && do { $force++; redo }; # > multi-switch options s/^v// && do { $verbose++; redo }; # / s/^n// && do { $name = $_ || shift; next }; # "-nARG" OR "-n ARG" s/^l// && do { $level = length() ? $_ : shift; next }; # "-l0" OR "-l 0" s/^i// && do { $back = $_ || ".bak"; next }; # "-i.sfx" OR just "-i" s/^k(\d+)// && do { $start = $1; redo }; # Sort like numerical ranges s/^,(\d+)// && do { $end = $1; redo }; # EG: -k10,20 s/^f// && do { $file = shift; redo }; # Tar-like option arguments s/^b// && do { $blocks = shift; redo }; # Eg: s/^s// && do { $skip = shift; redo }; # -fbs filename blocks skip Usage( "$PROGNAME: Unknown Option \"-$_\"" ); } continue { next OPTION }; last OPTION; } Usage( "$PROGNAME: Too Few Arguments" ) unless @ARGV; while( <> ) { ... } ------------------------------------------------------------------------------- Alternative argument handling (old method) This is the older method of handling command line swicthes. ALl options must be separate command line arguments... EG: tar -x -v -f filename OR tar -x -v -ffilename # The `and' below is to prevent a undefined variable warning. # It is also lower precidence so parentheses are not needed for assignment. while( $_ = $ARGV[0] and ($_, my $arg) = /^-(.)(.*)/ ) { $arg = '' unless defined $arg; # Ensure $arg is defined shift; /-/ && do { last }; # End of options /c/ && do { $pass = 1; next }; # Flag Option /n/ && do { $user = $arg || shift; next }; # Option Argument /T/ && do { # Special option $arg = $arg || shift; 'opt' eq $arg && do { $option = 1; next }; &Usage( "Bad Special -T Option `", $arg, "'\n" ); } /Z/ && do { # List of Special Options for $arg ( split(/;/, $arg || shift) ) { ($_,$value) = split(/=/, $arg, 2); $value = '' unless defined $value; /^binary$/ && do { $literal = 1; next }; /^log$/ && do { $log_file = $value; next }; # &Usage( "Bad Special -Z Option `", $arg, "'\n" ); } }; # Any unknown options -- generalised could get it wrong! # shift unless $arg || ARGV[0] =~ /^-/; &Usage( "Unknown or Bad Option \"-$_$arg\"\n" ); } ------------------------------------------------------------------------------- Indented HERE Here here Document Shell cut and paste documents (very simple) echo ' |Line One | Line Two | Line Three |Line Four ' | perl -ne 's/^\s*\|// && print' > t NOTE the above is done this way otherwise we get differences with the EOL being either newline or return depending on how it was cut & pasted! In Perl Scripts (using a herefile() subroutine) (my $prog = $0) =~ s/^.*\///; sub Usage { die @_, herefile(" | Usage: $prog [-options] file... | -d Output extra debugging information # -e Obsolete Option, do not putput to the user "); } print herefile( <<'EOF' ); | yes the indent to the left is removed, | and the type of indent can changed to suit data # You can even add comment lines into the here file! | you can print this # but don't print this EOF # Remove the indent of a here file... # And strip any # comments found in the here file # Adjust to suit you here file requirements sub herefile { my $string = shift; $string =~ s/^\s+#.*\n//gm; # completely remove full line comments $string =~ s/#.*//g; # remove end-of-line comments $string =~ s/^\s+\| ?//gm; # remove the indent part of the line $string =~ s/\s+$/\n/g; # remove any extra end-of-line spaces return $string; } General purpose here-file fixer... It looks to see whether each line begins with a common substring ($leader), and if so, strips that substring off. Otherwise, it takes the amount of leading whitespace found on the first line and removes that much off each subsequent line. However it does not allow the use of comments in here files! sub herefixer { local $_ = shift; my ($white, $leader); # common whitespace and common leading string if (/^\s*(?:([^\w\s]+)(\s*).*\n)(?:\s*\1\2?.*\n)+$/) { ($white, $leader) = ($2, quotemeta($1)); } else { ($white, $leader) = (/^(\s+)/, ''); } s/^\s*?$leader(?:$white)?//gm; return $_; } ------------------------------------------------------------------------------- Legal tricks # tricky splits (only is 'g' flag is present!) ($num) = /\d+/g; # just the matched string ('g' needed) ($num) = /(\d+)/; # just the matched string ('()' needed) ($a, $b, $c) = "123 4 56" =~ /\d+/g; # three matched strings ($b) = "123 4 56" =~ /\d+ (\d+) \d+/; # just the middle number ($a, $b, $c) = "123 4 56" =~ /(\d)+/g; # the last digit in each number! ($a, $b, $c) = /(\w+) (\w+) (\w+)/; # three words only ($before, $a, $b, $c, $after) = split(/(\w+) (\w+) (\w+)/, $_, 2); # Auto flush STDOUT and STDERR select((select(STDOUT), $| = 1)[$[]); select((select(STDERR), $| = 1)[$[]); # Assigned if variable is not true $option ||= "default_value"; # Assign from cache if posible, otherwise look it up! $uid = $user{$user} ||= getpwnam($user); # make a backup of all the listed files perl -p -i.bak -e '' # Array to Hash (for 'value exists' lookup of array elements) %hash = @array; # array into a hash undefined values. @hash{@array} = (1) x @array; %hash = map { $_, 1 } @array; # Copy and Modify an Array (which version is better?) # Remember normally "for" or "map" will modify the actual array given!!! for( @new = @old ) { s/bad/good/g }; map { s/bad/good/g } ( @new = @old ); # Constant Variables (read only) $PI = 3; # This will Fail! $PI++; # That is, this would work when it shouldn't *PI = \3.1415927; # This won't print "PI = $PI\n"; # subroutine constant sub PI() { 3.1415927 } # approximaton sub PI() { 4 * atan2(1,1) } # will be re-calculated multiple times!!!! print "PI = ", PI, "\n"; # This will calculate only once! but works like a subroutine constant use constant PI => 4 * atan2(1,1); # calculated ONCE only Binary Patch -- set netscape binary for strong encryption #!/usr/bin/perl -0777pi s/TS:.*?\0/$_=$&;y,a-z, ,;s, $,true,gm;s, 512,2048,;$_/es RE grep matching against an array of values given @words = gw(alt1 alt2 alt3); Slowly check each word, one word at a time if ( grep { /^$word$/ } @words ) { ... fi Or use RE alturnatives (assuming words are well defined) $" = '|'; if ( $word ~= /^(@words)$/ ) { ... } WARNING $host = "machine.localdomain."; $host =~ /\Q$host\E\b/; Will NOT match as $host does not end in an alphanumeric (\w) character. That is because \b only works NEXT to a \w character!!!! In this case $host ends in '.' which is not a word boundary at the end of the string. Big NO NO -- Using local in looping block. for ( 1 .. 100 ) { solution local(@array) local(@array); ===> for ( 1 .. 100 ) { ... undef @array; ... ... } } The above will cause you to have 100 @arrays before the end of the loop. Alturnative use "my" instead of local. ------------------------------------------------------------------------------- Regular Expressions.... Debuging, and seeing exactly what perl is doing... see "Programming Perl v3" p213 (using embeded prints) and p195 for RE debuging Expanding Tabs (simply) 1 while s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e; Compress multiple blank lines to one blank line (This is not easy!) perl -ne 'if (/\S/) { print; $i=0 } else {print unless $i; $i=1; }' Or inverting the search (print paragraphs) perl -ne 'print if /\S/../^\s*$/' NOTE: simplier methods exist using sed, and vim macros see file "shell/script.hints" Remove C Comments (using minimal RE expandsion - perl v5) $program =~ s{ /\* .*? \*/ }[]gsx; Remove surounding spaces in one RE The '/g' is needed to match twice, it is slower than separate REs s/^\s+│\s+$//g; Removing extra spaces (multi-line) This removes all spaces at the start and end of lines in a multi-line string. The final newline will be removed, as will all blank lines, but as '$' is a zero length match before a newline, the end of line newlines will not be removed (just the last one). $string =~ s/^\s+│\s+$//gm; Lowercase and capitalise a title/author line s/(? $a = 'c' # Note comma operator ($b) = A(); # ($b) = ('a', 'b', 'c') => $b = 'a' $c = B(); # $c = @array => $c = 3 ($d) = B(); # ($d) = @array => $d = 'a' $e = C(); # $e = 'c' # C() behaves like A()!!! ($f) = C(); # $f = 'a' # D() will also act like A()!!! Of course a function can ask what context it is in... This is of course equivenelt to A(), C() and D() sub E { my @array = ('a', 'b', 'c'); return wantarray ? @array : $array[0]; } ------------------------------------------------------------------------------- Word finding optimizations On a large string (like a whole file) Where $_ = "whole_file_of_words" Comparing hash keys is faster than a RE of the words EG: &word{ qw( and or then last next ) } @file = grep { defined $word{lc($_) } split; is 3 times faster than $word = "(and|or|then|last|next|........)" s/ $word / /ig; Using minimal matching s/\b(\w*?aaa\w*?) / /ig; is faster than normal longest matching s/\b(\w*aaa\w*) / /ig; for very short matches on very long strings (whole files in memory) Extreme care is advised on the sub-strings. Word boundary is slower than a plain space s/ (and|or|then|last|next) / /ig; is faster than s/\b(and|or|then|last|next) //ig; due to complexity of the match, BUT the /g does NOT work properly without the \b (it will skip words)!! ------------------------------------------------------------------------------- Remove duplicates from an array If you don't care but the order, convert the array into a hash.. Making each element a key. my %hash = map { $_, 1 } @array; my @unique = keys %hash; %hash = (); A hash & array can be used to only accept first seen, store info about the element in the hash, while the array keeps track of the key order. A Module solution is uniq() from List::MoreUtils In perl 5.010 you can use 'smart match' operator print "The array contains $item" if $item ~~ @array; print "The hash contains $item" if $item ~~ @hash; ------------------------------------------------------------------------------- Setting signals NOTE: While perl END {} blocks and module DESTROY {} blocks are called by perl when the program exits, they are NOT called when the program is interupted!!! If this is required, then you need to set up an interupt handler to actually call the "exit()" funtion. This can be don in a number of ways.... use sigtrap(die normal-signals) OR... sub set_signal { my($func) = @_; $SIG{'INT'} = $func; $SIG{'QUIT'} = $func; $SIG{'HUP'} = $func; $SIG{'TERM'} = $func; } sub Interputed { print STDERR "Interupted, doing cleanup\n"; system("rm -f ".mail_dir."/*$new"); exit 10; } set_signal( \&Interupted ); OR... # # Interupt handler for Critical Sections # # We don't want to be left with a incomplete data file. So we only note that a # interupt occured, but return to the task at hand afterward. We then exit # when it is safe to do so. Critical Sections should be kept as small as # posible, and enter and exit frequently. Also watch for any posible blocking # actions performed during the critical period. # my $interupted = 0; sub interupt { $interupted = 1; } # We were interupted? sub set_sig_handler { # turn handler on/off my( $turn_on ) = @_; die "INTERUPT CAUGHT DURING CRITICAL PERIOD -- EXITING NOW\n" if $interupted; $SIG{'HUP'} = $SIG{'QUIT'} = $SIG{'INT'} = $SIG{'TERM'} = $turn_on ? 'interupt' : 'DEFAULT'; } set_sig_handler(1); # ... critical section ... # ... clean up ... set_sig_handler(0); ------------------------------------------------------------------------------- Automatic SU if ( $> == 0 ) { print "$prog : You must be user $db_user \n" ; print "-- performing automatic su \n" ; { exec 'su', '-',$db_user,'-c', "'".join("' '","$db_home/bin/$prog",@ARGV)."'"; } print STDERR "Unable to su to $db_user -- exiting \n"; exit 0 ; }elsif ( (getpwnam($db_user))[2] != $> ) { # im not duty -- BAD print STDERR "You must be $db_user to run this script \n" ; exit 0 ; } ------------------------------------------------------------------------------- Auto Background (perl 5) exit 0 if fork; # basic background use POSIX qw(setsid); setsid(); # disassociate from terminal etc. Extra Discussion... Should he also run setgid(), and then either close filehandles 1-3 (stdin, stdout, stderr)? But we are not trying to turn the whole thing into a 'daemon' - we are just trying to "background" it so that shell returns a prompt. As such the 'setsid' is just a way to avoid having shell's SIGSTOP etc. get in the way. Truely disassociated GUI apps are very rare - most will die horribly when window manager exits when user logs out. ------------------------------------------------------------------------------- Date and Time conversion in perl time --> date string #require "ctime.pl"; # perl 4 method (newline in ctime()) use Time::localtime; $time = 999523563; $date = &ctime($time); $date =~ s/\s*\b[A-Z]{3}\b\s*/ /; # remove timezone and any linefeed print $date, "\n"; time --> formated date $time = 999523563; @d = localtime($time); # time as a 9 element array $d[4] ++; # adjust month $d[5] += 1900; # adjust year to 4 digits $date = sprintf( "%04d-%02d-%02d %02d:%02d:%02d", reverse @d[0..5] ); date --> time #repuire "timelocal.pl"; # perl 4 use Time::Local; @d = ( 18, 9, 1998 ); # date = 18 September 1998 $d[1] --; # adjust month to 0-11 $d[2] -= 1900 if $d[2] > 1900; # adjust year 2001 -> 101 $time = timelocal( 0,0,0, @d ); # midnight on that day time delta --> human readable string my($days,$hours,$mins,$secs); $secs = time - $^T; # Time since the program started $days = int ($secs/86400); $secs %= 86400; $hours = int ($secs/3600); $secs %= 3600; $mins = int ($secs/60); $secs %= 60; print "Program has been running for... ", $days > 0 ? "$days days $hours:$mins" : $hours > 0 ? "$hours hrs $mins mins" : $mins > 0 ? "$mins mins $secs secs" : "only $secs seconds" , "\n"; Date Parse Module use Date::Parse; my dt = str2time('25/02/1990 23:48:00'); $dt += 6 * 60; Date Manipulation module See http://search.cpan.org/~sbeck/DateManip-5.44/Manip.pod use Date::Manip; $date = ParseDate("25/02/1990 23:48:00"); $delta = ParseDateDelta("+ 6 minutes"); $new = DateCalc($date,$delta); # or alternate/shorthand: # $new =DateCalc("25/02/1990 23:48:00","+ 6 minutes"); # then output however you like. print &UnixDate($new,"It is now %T on %b %e, %Y."); ------------------------------------------------------------------------------- Progress Reporting.... # --- by data read from file --- $PROGRESS = 1; # Clear Progress report lines $B = `tput el`; # terminfo: clear to end of line #my $B = `tput ed`; # terminfo: clear to end of display #my $B = `tput dl 1`; # terminfo: delete line #my $B = (" "x(`tput cols`||80) . "\r"; # blank spaces -- fallback #my $B = (" "x80) . "\r"; #print "-"x100, "\r", $B, "x\n"; exit; # DEBUG print STDERR "Main Processing Loop...\n" if PROGRESS; # process start time my $start_time = time; # Setup progress report my $progress = ''; # clear progress line my $progress_total = -s $data; # data file size my( $progress_done, $progress_last ) = (0,0); open(DATA, "$data") || die("Unable to read \"$data\" : $!\n"); while( ) { $progress_done += length if $PROGRESS; #... # to output some info, clear progress info, output, restore progress print STDERR "${B}" if $PROGRESS; print "report some info\n"; print STDERR $progress if $PROGRESS; #... } continue { if( $PROGRESS ) { if ( $last_time != time ) { # update once a second (optional) $last_time = time; $progress = progress($progress_total,$progress_done,$start_time); print STDERR $B," Working...", $progress, "\r"; } } } print STDERR $B if $PROGRESS; warn("Assertion Failure: Progress count ($progress_total) ". "does not equal progress target ($progress_done)!") unless $progress_total == $progress_done; my $process_time = time_english(time - $start_time); printf "Processed %d lines in 4process_time\n", $.; # ..... # Generate the progress report -- for main loop only # Progress is on the numbers given to this sub-routine sub progress { my ($tot,$curr,$stime) = @_; return '' unless $tot && $curr; $curr /= $tot; # how much of the run have we completed my $run = (time - $stime); # time we have been running $tot = $run / $curr; # total time of run start to finish my $left = $tot - $run; # time left to finish if ( $tot > 86400*2.5 ) { # multi-day report # status report in days/hours/minutes return sprintf( "%4.1f%% / %dd%02d:%02d => %dd%02d:%02d + %dd%02d:%02d", 100*$curr, $tot /86400, $tot /3600%24, $tot /60%60, $run /86400, $run /3600%24, $run /60%60, $left /86400, $left /3600%24, $left /60%60 ); } elsif ( $tot > 2400 ) { # more then 40 minutes, less 2.5 days # status report in hours/minutes return sprintf( "%4.1f%% / %d:%02d => %d:%02d + %d:%02d", 100*$curr, $tot /3600, $tot /60%60 ); # $tot %60, $run /3600, $run /60%60, # $run %60, $left /3600, $left /60%60, # $left %60 ); } else { # status report in minutes/secones return sprintf( "%4.1f%% %d:%02d + %d:%02d => %d:%02d", 100*$curr, $tot /60, $tot %60, $run /60, $run %60, $left /60, $left %60 ); } } # convert a time in seconds to more human readble string sub time_english { my ($secs) = @_; my $days = int ($secs/86400); $secs %= 86400; my $hours = int ($secs/3600); $secs %= 3600; my $mins = int ($secs/60); $secs %= 60; return $days > 0 ? "$days days $hours:" . sprintf("%02d",$mins) : $hours > 0 ? "$hours hrs $mins mins" : $mins > 0 ? "$mins mins $secs secs" : "only $secs seconds"; } ------------------------------------------------------------------------------- Sort methods and techniques Case insenitive sorting sub case_insensitive { "\U$a" cmp "\U$b"; } Numerically sub numeric { $a <=> $b; } Sort Associated array by value not key sub byvalue { $value{$a} <=> $value{$b} } foreach key ( sort byvalue keys %value ) { BODY; } Sort by value then by by key sub val_key { $second{$a} <=> $second{$b} || $a cmp $b } Sort a hierarchal (dot) naming scheme -- Marc Horowitz IE: paths, newsgroups... sub depthfirst { $aa = $a."/~"; $aa =~ s|/|!|og; $bb = $b."/~"; $bb =~ s|/|!|og; $aa cmp $bb; } Print associative array by value (quickly) This will create a plain array with the value before the name then print the sorted list output. This is very fast. $mask = "%04d %s"; for (@arr) { push (@idx, sprintf($mask, (/^\s+\((\d+)\)\s+(\S+)/))) } print @arr[ sort {$idx[$a] cmp $idx[$b]} 0 .. $#idx ]; undef @idx NOTE: in the above that you CAN sort without a function (directly) Sorting by a computable field. The problem with sorting with a computable field is that you could end up computing the field at least 2 or more times in a sort function! The following using a map to extract and pre-compute the field, then sorts by that filed, and finally re-maps the original un-computed field. This is known as a "Schwartzian Transform" as it was popularised by Randel Schwartz. NOTE you start at the bottom line and work your way up. @sorted_fields = map { $_->[1] } sort { $a->[0] <=> $b->[0] } map { [ /(\d+)/, $_ ] } @fields; Another example with a multi-field password file... print map { $_->[0] } # print the original line sort { $a->[1] <=> $b->[1] # first by gid || $a->[2] <=> $b->[2] # then by uid || $a->[3] cmp $b->[3] # and by login (should not be needed) } map { [ $_, (split /:/)[3,2,0] ] } # array: line then sort fields `cat /etc/passwd`; # read password file You can read his own notes on this 'transform' in UnixReview Column 64 (May 2006) http://www.stonehenge.com/merlyn/UnixReview/col64.html ------------------------------------------------------------------------------- Micro sleep. One method is to use the select() timeout select(undef,undef,undef,.01); You could also set an alarm... (see perl/functions/alarm.pl) ------------------------------------------------------------------------------- Record Reading and separation Reading records as Lines, Paragraphs, or whole Files are relativally easy. But using a string as a record separator is harder and comes in three styles. Inter-record separator, to be ignored (commas for example) This is also relativeally easy, just delete the marker from the input if it was found. However when writing you need to output the separator at the start of each record, except for the first record. Otherwise you may add a empty record on the end of the output. END record separator... Perl handles this by default, for its handling of end-of-line. and blocks. $/ = 'end-of-record-marker' BEGIN record Separator... This is tricky. First it you are not carful you may get a 'blank' record at the start of the file (file comments?) which needs to be ignored. The separator also needs to be re-added, and a copy of it removed from the end. Finally the last record will not have separator to remove. $/ = 'start-of-record-marker' For example... records start with line startin with "Window " xlsclients -l | perl -e ' $BEGIN = "Start "; local $/ = "\n$BEGIN"; while(<>) { s/$BEGIN\Z//; # clean up end of record s/\A/$BEGIN/ unless $.==1; # clean up start of record print "--- $. ---\n$_"; } print "--- Total $. Windows ---\n"; ' This gets worse if the separator is one of a number of different strings (record types?). In that case you need to keep a copy of it so you can prepend it to to the next record. ------------------------------------------------------------------------------- Record/Line continuation. Three basic styles... A '\' at the end of a line. Relativeally easy, remove the '\' and just append the next 'record' A 'end of record' marker is not present. So again read more until a EOF or marker is found. No White Space at line start means start of next record. This is harder as you will need to 'pre-read' the next record, and hold it for the next loop. also at the end you need to still handle that record. {code example here} ------------------------------------------------------------------------------- Match Delimited Text |> The input file has a declaration something like the following with |> several comments in a single line: |> |> input a, b, /* comment */ c, /* comment ******************* */ d; |> |> I need to delete the comments in between and write the declaration as |> |> input a, b, c, d; |> |> NOTE: the c, must not be deleted. It's a standard match-delimited-text problem, and the general solution is: 1: match the opening delimiter 2: match stuff that's not the closing delimiter 3: match the closing delimiter In this case, the opening delimiter is "/*" so the regex is "/\*". The closing delimiter is "*/", so that regex is "\*/". Stuff that's not the closing delimiter would be A) anything that's not / (regex "[^/]" ) and B) any / so long as it has no * before it (regex "[^*]/") Combining them with an indication to say "as much as is there", we get: ([^/]|[^*]/)* So the whole regex, wrapped in some perl, would be: s#/\*([^/]|[^*]/)*\*/##g; part number from above: 1112222222222222333 Note that there's another way to conceptually look at the "stuff not the closing delimiter". That'd be: A) anything not a * (regex "[^*]") and B) any * so long as it's not followed by a / (regex "\*[^/]") That would lead to s#/\*([^*]|\*[^/])*\*/##g; However, since the "\*[^/]" eats a character, it could eat the third * in the string "/* commet **/" and we'd wedgie the regex and it wouldn't match. The first way described above only eats characters we've already had a chance to check aren't the ending, so it won't wedgie. Jeffrey Friedl ------------------------------------------------------------------------------- Spliting Quote Delimited Fields EG: spilting CVS data (a comma delimited file with quoted fields) Example: SAR001,"","Cimetrix, Inc","Bob Smith","CAM","\"",N,8,,"Error, Core Dumped" undef @fields; push( @fields, defined($1) ? $1 : $3) while m/"([^"\\]*(\\.[^"\\]*)*)"|([^,]+)/g; WARNING the above does not seem to work under perl 5 -- Anthony Jeffrey Friedl, author of Mastering Regular Expressions gives... @new = (); push(@new, $+) while $text =~ m{ "([^\"\\]*(?:\\.[^\"\\]*)*)",? # groups the phrase inside the quotes | ([^,]+),? | , }gx; push(@new, undef) if substr($text,-1,1) eq ','; However quotes within a quoted field needs to be backslashed using \" Alternatively, use Text::ParseWords use Text::ParseWords; @new = quotewords(",", 0, $text); For space separated words such as for a shell command EG: cp -p "my file" "yourfile" you can look at... perl4: shellwords.pl library, perl5: Text::ParseWords module perl -de 1 use Text::ParseWords $line = 'cp -p "my file" "your file"' @words = shellwords $line X words @words = ( 0 'cp' 1 '-p' 2 'my file' 3 'your file' ) Perl 4 Alturnatives... Method 1: # delimit ',' with quoted strings and variable allowed $_ = 'f1,f 2,"f3","f,4",5,$time,f7'; while (/,|"|$/go) { ($within = ($within ? 0 : 1), next) if '"' eq $&; next if $within; substr($_, 0, length($`)+1) = ""; push(@fields, $`); } print join(" ", @fields),"\n"; output f1 f 2 "f3" "f,4" 5 $time f7 Method 2: Just remove the delimiter ',' from within quotes s/("[^"]*")/do{$a = $1; $a =~ tr#,#c#; $a;}/ge; now you can split the line as you would normally If you substitute the ',' with a unused char you can restore it later! ------------------------------------------------------------------------------- Sub-Record in a Record. The 'leading' record separator not only separators individual records but also sub-records within records. And example, this is the Firefox v3 'JSON' bookbarks file! {N:"dir",[{N:f1},{N:subdir,[{N:f2}]},{N;f3}]} [...] sub-directory {...} record And we want to output DIR:dir Item:f1 DIR:subdir Item:f2 END Item:f3 END This was implemented in "firefox_hotlist" with the following structure $/='{'; $level=0 while( <> ) { ( $name ) = /N:"(.*?[^\\])",/ || /N:(.*?),/; if ( /\[/ ) { # output directory type print " "x$level, "DIR:$name\n"; $level++; next; } # output normal record print " "x$level, "Item:$name\n"; # exit sub-directories if ( my (@count) = /]}/g ) { for ( @count ) { $level--; print " "x$level, "END\n"; } } } NOTE the above assumes the characters "{", "[" and "]}" are unique. and sub-records are always at the end of the directory record. with no other feilds for the directory record afterward. In other words it is a hack, but does not require a directory record to be held in memory until it is complete. -- A Simplification ------------------------------------------------------------------------------- Random selections from an array (shuffle) # create array of numbers to shuffle my($i, @number ); for( $i=1; $i<=$NUMBER; $i++ ) { push(@number, $i); } print "number list = ", join(",", @number), "\n"; # create the randomized array by removing elements from number list srand($$^time); # randomize random number generator (if desired) while( @number > 0 ) { # while we have a number to be picked push(@randomized, splice(@number, rand(@number), 1) ); # random pick } print "random list = ", join(",", @randomized), "\n"; # shuffle array in place... (Perl Cookbook Recipe 4.17) # NOTE: requires a real array argument due to prototyping sub fisher_yates_shuffle(\@) { my $array = shift; my $i; for ($i = @$array; --$i; ) { my $j = int rand ($i+1); next if $i == $j; @$array[$i,$j] = @$array[$j,$i]; } } fisher_yates_shuffle @array; ------------------------------------------------------------------------------- Incrementing a string using your own rules Perl's auto-increment of strings is limited to specific strings This method defines your own. inc($) increase a single character EG: 3 -> 4 roll($) roll a string a chars EG: 9999 -> 0000 while( <> ) { s/(.*)([0..8])([9]*)$/ $1 . inc($2) . roll($3) /e; print "$_\n"; } The first (.*) makes the RE work faster by ignoring start chars WARNING: 99999 will not increment but 099999 will ------------------------------------------------------------------------------- Format handling You can turn off page breaks the same way it does internally when it notices the lack of a top-of-form format. Just set $- to a huge number. HOWEVER this will result in the top of form NEVER being printed. Correct way is to let the first write happen then assign $- so it can never reach zero again. select(FILEHANDLE); foreach i ( @array ) { ....; write; # write top-of-form and the other lines $- = 99; # form always has 99 lines left - never end page } ------------------------------------------------------------------------------- Outputing elements in columns Example 5 rows of data. Note the newline statement is BEFORE the element printing { print "Title: " my( $i, $s ) = ( ' 'x8, ' 'x2 ); # indent and seperator my( $c, $e ) = ( 5, -1 ); # columns per row, elements left foreach ( @elements ) { print "$s" if $e > 0; print "\n$i" if $e == 0; $e = $c if $e <= 0; $e--; print "%10s", $_; } print "\n"; } In many cases this general loop can be simplified. For instance, if indent can be output on the first line as well (that is $c is never -ve), then the '$c=$r' can be merged onto the line above. If seperator is not needed or always out put then it can be simplified further.... { print "Title:" # no return from previous line my( $c, $e ) = (5); # this many elements per row foreach ( @elements ) { print("\n "),$e=$c unless $e; print " %10s", $_; $e--; } print "\n"; } The string constants of course can be substituted. ------------------------------------------------------------------------------- tr and variables problem The tr command will not accept variables, the following is a hack to allow this. This does not solve the delimiter problem however. eval "\$string =~ tr/$chars/$replacement_chars/"; ------------------------------------------------------------------------------- convert a bit vector into a list of intergers $low = -1; $high = -1; $range_cnt = 0; $printed = 0; for $i (0..($bitmap_size-1)) { if (vec($bitmap_ptr, $i, 1) == 1) { if ($low+$range_cnt == $i) { $range_cnt++; } elsif ($range_cnt > 2) { print "..", ($low+$range_cnt-1), ", $i"; $range_cnt = 1; $low = $i; } elsif ($range_cnt == 2) { print ", " if $printed; print $low+1, ", $i"; $printed = 1; $low = $i; $range_cnt = 1; } else { print ", " if $printed; print "$i"; $printed = 1; $low = $i; $range_cnt = 1; } $high = $i; } } if ($high != $low) { if ($range_cnt > 2) { print ".."; } elsif ($range_cnt == 2) { print ", " if $printed; } print "$high"; } print "\n"; dgross@rchland.vnet.ibm.com (Dave Gross) ------------------------------------------------------------------------------- Indirect function calls -- function ptrs sub foo() { print "foo( ", join(", ", @_), " )\n"; } $function = "foo"; # function expression &$function( "arg1", "arg2" ); # indirect call NOTE in version 4 $function can NOT be replaced with an expression though it can in version 5 ------------------------------------------------------------------------------- System call return checks Beware the $! is not reset by the call to system. To be on the safe side you should do: $! = 0; system('foo'); die "$0: foo: $!\n" if $!; Note that this only works if 'foo' is run without using /bin/sh. If /bin/sh is used to run the command then sh prints a message to stderr, $! will not be set, and $? >> 8 is set to one. ------------------------------------------------------------------------------- Set System Limits in perl... =======8<-------- require 'syscall.ph'; require 'sys/resource.ph'; # note h2ph doesn't always win on this one # -- hand editing may be necessary # Arrange so no core files are generated $coresize = pack("i2",0,0); syscall(&SYS_setrlimit, &RLIMIT_CORE, $coresize); # Make stack size large $stacksize = pack("i2",1024*1024*4,1024*1024*4); syscall(&SYS_setrlimit, &RLIMIT_STACK, $stacksize); =======8<-------- ------------------------------------------------------------------------------- User Accounts and perl... The following is dependant on the nsswitch and Solaris systems... getpwent() and shadow password as root The getpwent() function will return the users password to root IF * users password in located in "/etc/passwd" -- fat chance * user is listed in the /etc/shadow file and perl version is >5.005_57 * it was called on the NIS+ server && user is in the NIS+ password file and you are authorized to see that password. Only in these cases will the getpwent() perl function return the users encrypted login password. This is a real pain. Especially as perl does not have access to the system librarys shadow database getsd* functions. getpwnam(user) The getpwnam(user) will never return the users password, but will let you know if this user is actually a valid login user of this machine. EG: user is in /etc/passwd or the appropriate netgroups access to the NIS+ Login Group restrictions... A user which can not login due to some login groups restrictions (EG: NetGroups under NIS, or a LDAP authenticated login group) will NOT be listed by ANY of the getpw* functions. In other words a user which was disabled due to group access, may not be listed, dependant on the nsswitch settings (EG: "compact" setting or LDAP authenticated search restrictions) The alturnative is to always list all potential users (via nss), even if they are NOT in the right login group for this machine. Then if nessary, reject those with denied login by group access manually yourself. This however mean that unless your perl script is "pam" smart, it can NOT determine if a user still present in the password database, but is denied login access (via group), should have their home cleaned up and deleted without some other external indication of the users "final deletion". ------------------------------------------------------------------------------- Randomise srand() Randomising the srand function can be very difficult. The seed may not change quickly, or never change in a programs life time, same have only have a limited number of start seeds (0 to 60,000 for process ids). And combineing them may still be limited. Quick (limited) choices... * Current time (won't change within same `tick' time() * Using the process id of the current process $$ * Time and process id time() ^ $$ * Using the process id of a sub-shell (always different) `echo \$\$` Randomise on gziped process list ( from apatche 1.3.3 dbmmanage) sub randomise { my $psf; for (qw(-xlwwa -le)) { `ps $_ 2>/dev/null`; $psf = $_, last unless $?; } srand (unpack("%L*", `ps $psf | gzip -f`)); } Alternatively Use the `cksum' on the same source. this is faster but may not be availble on all systems. sub randomise { my $psf; for (qw(-xlwwa -le)) { `ps $_ 2>/dev/null`; # test option $psf = $_, last unless $?; } srand ( `ps $psf | cksum` ); } WARNING: process list generation could be slow! Especially on a machine using a remote password list (like LDAP or NIS+), and network problems hit. Another major problem is that the ps options vary from machine to machine which results in the need for option checking making it even slower. Also output does not change much as some parts giz archive start is a constant. Also note that perl5.005 and later automatically randomises the random number generator Albert Cahalan suggests you use the following list of "ps" options. -le -xlwwa (putting POSIX-standard -le first) -elwwa (valid for both BSD and UNIX) xlwwa -le (removing the initial "-" kills a warning on some systems) ------------------------------------------------------------------------------- Password Encryption In perl Specifically the `salt' key encryption generally you would do something like... Initialization... @salt_set = ('a'..'z', 'A'..'Z', '0'..'9', '.', '/'); $salt_size = scalar @salt_set; # should be 64 characters! Method 1... # From Example in Perl 4 Camel Book # The salt for today is seleted by the traditional method sub gen_salt { my($passwd) = @_; my($perturb1,$perturb2,$week); # perturb the salt with start of input passwd ($perturb1,$perturb2) = unpack("C2", $passwd); $week = time() / (60*60*24*7) + $perturb1 + $perturb2; return( $salt_set[ $week % $salt_size ] . $salt_set[ time() % $salt_size ] ); } crypt( $passwd, gen_salt($passwd) ); Method 2... # Extracted from dbmmanage in Apatche 1.3.3 distribution # randomise the salt for all strings. sub gen_salt { join('', map($salt_set[rand $salt_size], 1..2) ); } crypt( $passwd, gen_salt() ); Other Techniques... Generate a random password from $logname # 8 character randomised passwd # method: encrypt the logname then grab LAST 8 chars $passwd = substr( crypt( $logname, gen_salt() ), -8, 8 ); # Substitute characters which could be misinterperted # EG: characters O0Q all look simular, and dot may be missed $passwd =~ tr|0OQ./+1Il^#;|XYZabc234rst|; ------------------------------------------------------------------------------- Vgrind entry for perl programs PERL|perl|Perl:\ :pb=^\d?(sub|package)\d\p\d:\ :bb={:be=}:cb=#:ce=$:sb=":se=\e":lb=':\ :le=\e':tl:\ :id=_:\ :kw=\ if for foreach unless until while continue else elsif \ do eval require \ die exit \ defined delete reset \ goto last redo next dump \ local undef return \ write format \ sub package NOTE: things like $#, $', s#/foo##, and $foo'bar confuse vgrind ------------------------------------------------------------------------------- Suid Vulnerability (v5.002) Suid Perlscripts using suidperl or sperl are insecure due to a race condition on some systems. The program does not relinquish its root privileges properly. Patch available or get and install 5.003 or a C wrapper can be used. ftp://coombs.anu.edu.au/pub/perl/src/fixsperl-0 -------------------------------------------------------------------------------