#!/usr/bin/env qore %enable-all-warnings %disable-warning undeclared-var # Examples of qore regular expressions # initially written and tested for qore 0.4.0 (Oct 2005) # by Helmut Wollmersdorfer # TODO: check against http://www.opengroup.org/onlinepubs/007908799/xbd/re.html # regex in qore is based on PCRE (perl5 regular expressions) # The following examples are in the order of regex(7) of Debian/Sarge # Comments beginning with '##' are quotes from regex(7) ## A (modern) RE is one(!) or more non-empty(!) branches, separated by '|'. ## It matches anything that matches one of the branches. $t = 'Branches'; # text $s = 'abc'; # string $p = 'a|z'; # pattern printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /a|z/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'qrs'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /a|z/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## A branch is one(!) or more pieces, concatenated. It matches a match ## for the first, followed by a match for the second, etc. $t = 'Pieces'; $s = 'abcxyz'; $p = 'bc'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /bc/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'bac'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /bc/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## A piece is an atom possibly followed by a single(!) '*', '+', '?', or bound. $t = 'Atoms and repeaters'; $s = 'abcxyz'; $p = 'ab*c+x?y{1}'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /ab*c+x?y{1}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'bac'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /ab*c+x?y{1}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom followed by '*' matches a sequence of 0 or more matches of the atom. $t = 'None or more'; $s = 'abbbc'; $p = 'ab*c'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /ab*c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'adc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /ab*c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom followed by '+' matches a sequence of 1 or more matches of the atom. $t = 'One or more'; $s = 'abbbc'; $p = 'ab+c'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /ab+c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'ac'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /ab+c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom followed by '?' matches a sequence of 0 or 1 matches of the atom. $t = 'None or one'; $s = 'abc'; $p = 'ab?c'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /ab?c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'adc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /ab?c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## A bound is '{' followed by an unsigned decimal integer, possibly followed ## by ',' possibly followed by another unsigned decimal integer, ## always followed by '}'. $t = 'Bound'; $s = 'abcc'; $p = 'a{1}b{0,}c{2,3}'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /a{0}b{1,}c{2,3}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'adc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /a{0}b{1,}c{2,3}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## The integers must lie between 0 and RE_DUP_MAX (255(!)) inclusive, ## and if there are two of them, the first may not exceed the second. $t = 'Bound integers'; $s = 'abcc'; $p = 'b{0,1}c{2,255}'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /b{0,1}c{2,255}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /b{0,1}c{2,255}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom followed by a bound containing one integer ## i and no comma matches a sequence of exactly i matches of the atom. $t = 'Bound integer exactly'; $s = 'abccd'; $p = 'bc{2}d'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /bc{2}d/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcccd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /bc{2}d/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom followed by a bound containing one integer i and a comma matches a ## sequence of i or more matches of the atom. $t = 'Bound integer or more'; $s = 'abccccc'; $p = 'c{2,}'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /c{2,}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcdc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /c{2,}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom followed by a bound containing two integers i and j matches a sequence ## of i through j (inclusive) matches of the atom. $t = 'Bound integer through maximum'; $s = 'abccccd'; $p = 'bc{2,4}d'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /bc{2,4}d/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcdbcccccd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /bc{2,4}d/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## An atom is a regular expression enclosed in '()' (matching a match for ## the regular expression), $t = 'Enclosed regex'; $s = 'abc'; $p = '(b)'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /(b)/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'acd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /(b)/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## an empty set of '()' (matching the null string)(!), $t = 'Enclosed empty'; $s = ''; $p = '^()$'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /^()$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'acd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^()$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## a bracket expression (see below), ## '.' (matching any single character), $t = 'any single character'; $s = 'abc'; $p = 'a.c'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /a.c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'acd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /a.c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## '^' (matching the null string at the beginning of a line), $t = 'beginning of line/string'; $s = 'abc'; $p = '^ab'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /^ab/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'acd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^ab/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## '$' (matching the null string at the end of a line), $t = 'end of line/string'; $s = 'abc'; $p = 'bc$'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /bc$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'bcd'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /bc$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## a '\' followed by one of the characters '^.[$()|*+?{\' (matching that ## character taken as an ordinary character), $t = 'escaped special character'; $s = '^.[$()|*+?{\\'; $p = '\^\.\[\$\(\)\|\*\+\?\{\\'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /\^\.\[\$\(\)\|\*\+\?\{\\/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = '\^\.\[\$\(\)\|\*\+\?\{\\'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /\^\.\[\$\(\)\|\*\+\?\{\\/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); # This seems to be against POSIX ## a '\' followed by any other character(!) ## (matching that character taken as an ordinary character, as if the '\' ## had not been present(!)), # NOTE: escaping multi-byte characters does not work for some reason (\§) $t = 'escaped ordinary character'; $s = '!"%&=~#-_>'; # $p = '\a\A\!\"\§\%\&\=\~\#\-\_\>'; $p = '\!\"\%\&\=\~\#\-\_\>'; # -- digits do not work: REGEX-COMPILATION-ERROR: Invalid back reference # printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /\a\A\1\!\"\%\&\=\~\#\-\_\<\>\,\;/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); # -- with PCRE the paragraph sign seems to be a problem # | unhandled QORE System exception thrown at ./regex.q:177 # | REGEX-COMPILATION-ERROR: invalid UTF-8 string # | chained exception: # | unhandled QORE System exception thrown at ./regex.q:179 # | REGEX-COMPILATION-ERROR: invalid UTF-8 string # printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /\a\A\!\"\§\%\&\=\~\#\-\_\>/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); # printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /\a\A\!\"\§\%\&\=\~\#\-\_\>/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /\!\"\%\&\=\~\#\-\_\>/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = '\!\"\%\&\=\~\#\-\_\>'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /\!\"\%\&\=\~\#\-\_\>/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); # escaping '<,;}' with '\' has problems # NOTE: it would be against POSIX 2, actual behaviour is o.k. $t = 'escaped with problems'; $s = '<,;}'; $p = '\<\,\;\}'; #printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /\<\,\;\}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = '\<\,\;\}'; #printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /\<\,\;\}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## or a single character with no other significance (matching that character). $t = 'unescaped character'; $s = '<,;}'; $p = '<,;}'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /<,;}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /<,;}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## A '{' followed by a character other than a digit is an ordinary character, ## not the beginning of a bound(!). $t = 'unescaped character'; $s = 'a{b}'; $p = 'a{b}'; # NOTE: it would be against POSIX 2, actual behaviour is o.k. #REGEX-COMPILATION-ERROR: Invalid content of \{\} #REGEX-COMPILATION-ERROR: Unmatched \{ #printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /a{b/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc'; #printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /a{b}/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## It is illegal to end an RE with '\'. ## A bracket expression is a list of characters enclosed in '[]'. It normally ## matches any single character from the list (but see below). $t = 'bracket expressions'; $s = 'abc'; $p = 'a[bB]c'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /a[bB]c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'adc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /a[bB]c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## If the list begins with '^', it matches any single character (but see ## below) not from the rest of the list. $t = 'negated bracket list'; $s = 'abc'; $p = 'a[^d]c'; printf("%s %s: \"'%s' =~ /%s/\"\n", ( $s =~ /a[^d]c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'adc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /a[^d]c/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## If two characters in the list are separated by '-', this is shorthand for ## the full range of characters between those two (inclusive) in the collating ## sequence, e.g. '[0-9]' in ASCII matches any decimal digit. $t = 'char range'; $s = 'abc'; $p = '^[a-z]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[a-z]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'ABC'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[a-z]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## It is illegal(!) for two ranges to share an endpoint, e.g. 'a-c-e'. Ranges are ## very collating sequence-dependent, and portable programs should avoid relying on them. ## To include a literal ']' in the list, make it the first character (following ## a possible '^'). $t = 'literal bracket'; $s = 'abc]'; $p = '^[]a-c]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[]a-c]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc['; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[]a-c]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## To include a literal '-', make it the first $t = 'literal hyphen first'; $s = 'abc-'; $p = '^[-a-c]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[-a-c]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc['; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[-a-c]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## or last character, $t = 'literal hyphen last'; $s = 'abc-'; $p = '^[a-c-]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[a-c-]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc['; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[a-c-]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## or the second endpoint of a range. $t = 'hyphen range endpoint'; $s = '!#-'; $p = '^[!--]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[!--]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc['; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[!--]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## To use a literal '-' as the first endpoint of a range, enclose it in '[.' and '.]' ## to make it a collating element (see below). $t = 'hyphen range startpoint'; $s = 'abc-'; $p = '^[--c]+$'; # this is POSIX # printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[.-.]-c]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); # this is PCRE printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[--c]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc!'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[--c]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## With the exception of these and some combinations using `[' (see next paragraphs), ## all other special characters, including '\', lose their special significance within ## a bracket expression. # NOTE: In Perl-Regex escaping of all characters within a bracket expression is allowed. $t = 'bracket unescaped'; $s = '.$()|*+?{}\<>'; # $p = '^[.$()|*+?{}\<>]+$'; # POSIX $p = '^[.$()|*+?{}\\<>]+$'; # PCRE printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[.$()|*+?{}\\<>]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[.$()|*+?{}\\<>]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## Within a bracket expression, a collating element (a character, a multicharacter ## sequence that collates as if it were a single character, or a collating-sequence ## name for either) enclosed in '[.' and '.]' stands for the sequence of ## characters of that collating element. The sequence is a single element of the ## bracket expression's list. A bracket expression containing a multi-character ## collating element can thus match more than one character, e.g. if the collating ## sequence includes a 'ch' collating element, then the RE `[[.ch.]]*c' matches the ## first five characters of 'chchcc'. $t = 'collating element'; $s = 'abcba'; $p = '^[[.abc.]]+$'; # REGEX-COMPILATION-ERROR: Invalid collation character # printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[.abc.]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcbaa'; # printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[.abc.]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## Within a bracket expression, a collating element enclosed in '[=' and ## '=]' is an equivalence class, standing for the sequences of characters ## of all collating elements equivalent to that one, including itself. ## (If there are no other equivalent collating elements, the treatment is ## as if the enclosing delimiters were '[.' and '.]'.) For example, if o ## and ^ are the members of an equivalence class, then '[[=o=]]', ## '[[=^=]]', and '[o^]' are all synonymous. An equivalence class may ## not(!) be an endpoint of a range. ## Within a bracket expression, the name of a character class enclosed in ## '[:' and ':]' stands for the list of all characters belonging to that ## class. Standard character class names are: ## alnum digit punct ## alpha graph space ## blank lower upper ## cntrl print xdigit $t = 'named character class'; $s = 'abc123'; $p = '^[[:alnum:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:alnum:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcbaa.'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:alnum:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = 'abc'; $p = '^[[:alpha:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:alpha:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcbaa.'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:alpha:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = ' '; $p = '^[[:blank:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:blank:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcbaa.'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:blank:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = "\t\r\n"; $p = '^[[:cntrl:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:cntrl:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcbaa.'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:cntrl:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = '1234567890'; $p = '^[[:digit:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:digit:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abcbaa.'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:digit:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = 'abc'; $p = '^[[:lower:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:lower:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'ABC'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:lower:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = 'abc'; $p = '^[[:print:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:print:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = ''; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:print:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = '!?,;.:'; $p = '^[[:punct:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:punct:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc1'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:punct:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = "\t\r\n "; $p = '^[[:space:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:space:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc1'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:space:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = 'ABC'; $p = '^[[:upper:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:upper:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'abc'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:upper:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'named character class'; $s = '0123456789abcdefABCDEF'; $p = '^[[:xdigit:]]+$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[[:xdigit:]]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'g'; printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /^[[:xdigit:]]+$/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## These stand for the character classes defined in wctype(3). A locale ## may provide others. A character class may not be used as an endpoint ## of a range. ## There are two special cases(!) of bracket expressions: the bracket ## expressions '[[:<:]]' and '[[:>:]]' match the null string at the begin- ## ning and end of a word respectively. A word is defined as a sequence ## of word characters which is neither preceded nor followed by word char- ## acters. A word character is an alnum character (as defined by ## wctype(3)) or an underscore. This is an extension, compatible with but ## not specified by POSIX 1003.2, and should be used with caution in soft- ## ware intended to be portable to other systems. $t = 'Word begin'; $s = ' abcd efg'; $p = '[[:<:]]abc'; # REGEX-COMPILATION-ERROR: Invalid character class name #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /[[:<:]]abc/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = ' xabcd efg'; #printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /[[:<:]]abc/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'Word end'; $s = ' abcd efg'; $p = 'bcd[[:>:]]'; # REGEX-COMPILATION-ERROR: Invalid character class name #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /bcd[[:>:]]/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = ' abcdx efg'; #printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /bcd[[:>:]]/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); $t = 'Word'; $s = ' abcd efg'; $p = '[[:<:]]abcd[[:>:]]'; #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /[[:<:]]abcd[[:>:]]/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = ' xabcd efg'; #printf("%s %s: \"'%s' !~ /%s/\"\n", ( $s !~ /[[:<:]]abcd[[:>:]]/ ) ? 'PASS' : 'FAIL' ,$t, $s, $p); ## In the event that an RE could match more than one substring of a given ## string, the RE matches the one starting earliest in the string. If the ## RE could match more than one substring starting at that point, it ## matches the longest. Subexpressions also match the longest possible ## substrings, subject to the constraint that the whole match be as long ## as possible, with subexpressions starting earlier in the RE taking pri- ## ority over ones starting later. Note that higher-level subexpressions ## thus take priority over their lower-level component subexpressions. ## Match lengths are measured in characters, not collating elements. A ## null string is considered longer than no match at all. For example, ## 'bb*' matches the three middle characters of 'abbbc', ## '(wee|week)(knights|nights)' matches all ten characters of ## 'weeknights', when '(.*).*' is matched against 'abc' the parenthesized ## subexpression matches all three characters, and when `(a*)*' is matched ## against 'bc' both the whole RE and the parenthesized subexpression ## match the null string. ## If case-independent matching is specified, the effect is much as if all ## case distinctions had vanished from the alphabet. When an alphabetic ## that exists in multiple cases appears as an ordinary character outside ## a bracket expression, it is effectively transformed into a bracket ## expression containing both cases, e.g. 'x' becomes '[xX]'. When it ## appears inside a bracket expression, all case counterparts of it are ## added to the bracket expression, so that (e.g.) '[x]' becomes '[xX]' ## and '[^x]' becomes '[^xX]'. ## No particular limit is imposed on the length of REs(!). Programs ## intended to be portable should not employ REs longer than 256 bytes, as ## an implementation can refuse to accept such REs and remain POSIX-com- ## pliant. ## Finally, there is one new type of atom, a back reference: '\' followed ## by a non-zero decimal digit d matches the same sequence of characters ## matched by the dth parenthesized subexpression (numbering subexpressions ## by the positions of their opening parentheses, left to right), so that ## (e.g.) '\([bc]\)\1' matches 'bb' or 'cc' but not 'bc'. # Metacharacters $t = 'Empty string'; $s = ''; $p = '^$'; printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s = 'a'; printf("%s %s: \"'%s' !~ /%s/\"\n",( $s !~ /^$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); # Character class $t = 'char class meta'; $p = '[.;:*+?\/\\\()!"$^,-~{}=&%@]'; # must be escaped: '('->'\(', '\'->'\\' # This is POSIX and will not work with PCRE $t = 'Word begin'; $s = ' abcd efg'; $p = '\<abc'; #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /\<abc/) ? 'PASS' : 'FAIL' ,$t, $s, $p); # This is POSIX and will not work with PCRE $t = 'Word end'; $s = ' abcd efg'; $p = 'bcd\>'; #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /bcd\>/) ? 'PASS' : 'FAIL' ,$t, $s, $p); # This is POSIX and will not work with PCRE $t = 'Word'; $s = ' abcd efg'; $p = '\<abcd\>'; #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /\<abcd\>/) ? 'PASS' : 'FAIL' ,$t, $s, $p); $s =~ s/\t\n\r\s\S\w\W\d\D//; $t = 'Symbolic character classes'; # do not work $s = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789'; $p = '^[\w]+$'; #printf("%s %s: \"'%s' =~ /%s/\"\n",( $s =~ /^[\w]+$/) ? 'PASS' : 'FAIL' ,$t, $s, $p); # substitution #$s =~ s/abc/xyz/; #$s =~ s/abc/xyz/g; #$s = 'xyz a bc'; #$s =~ s/[a-c]{1,2}/ a /; #$s =~ s/\(a\)/ a /; #$s =~ s/\(a\)/ \1 \n/; $s = 'abc'; #$s =~ s/(a)/ \1 \n/; # does not work #print($s);