1 #
   2 # Secret Labs' Regular Expression Engine
   3 #
   4 # convert template to internal format
   5 #
   6 # Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
   7 #
   8 # See the sre.py file for information on usage and redistribution.
   9 #
  10 
  11 """Internal support module for sre"""
  12 
  13 import _sre, sys
  14 import sre_parse
  15 from sre_constants import *
  16 from _sre import MAXREPEAT
  17 
  18 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  19 
  20 if _sre.CODESIZE == 2:
  21     MAXCODE = 65535
  22 else:
  23     MAXCODE = 0xFFFFFFFF
  24 
  25 def _identityfunction(x):
  26     return x
  27 
  28 _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
  29 _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
  30 _SUCCESS_CODES = set([SUCCESS, FAILURE])
  31 _ASSERT_CODES = set([ASSERT, ASSERT_NOT])
  32 
  33 def _compile(code, pattern, flags):
  34     # internal: compile a (sub)pattern
  35     emit = code.append
  36     _len = len
  37     LITERAL_CODES = _LITERAL_CODES
  38     REPEATING_CODES = _REPEATING_CODES
  39     SUCCESS_CODES = _SUCCESS_CODES
  40     ASSERT_CODES = _ASSERT_CODES
  41     for op, av in pattern:
  42         if op in LITERAL_CODES:
  43             if flags & SRE_FLAG_IGNORECASE:
  44                 emit(OPCODES[OP_IGNORE[op]])
  45                 emit(_sre.getlower(av, flags))
  46             else:
  47                 emit(OPCODES[op])
  48                 emit(av)
  49         elif op is IN:
  50             if flags & SRE_FLAG_IGNORECASE:
  51                 emit(OPCODES[OP_IGNORE[op]])
  52                 def fixup(literal, flags=flags):
  53                     return _sre.getlower(literal, flags)
  54             else:
  55                 emit(OPCODES[op])
  56                 fixup = _identityfunction
  57             skip = _len(code); emit(0)
  58             _compile_charset(av, flags, code, fixup)
  59             code[skip] = _len(code) - skip
  60         elif op is ANY:
  61             if flags & SRE_FLAG_DOTALL:
  62                 emit(OPCODES[ANY_ALL])
  63             else:
  64                 emit(OPCODES[ANY])
  65         elif op in REPEATING_CODES:
  66             if flags & SRE_FLAG_TEMPLATE:
  67                 raise error("internal: unsupported template operator")
  68             elif _simple(av) and op is not REPEAT:
  69                 if op is MAX_REPEAT:
  70                     emit(OPCODES[REPEAT_ONE])
  71                 else:
  72                     emit(OPCODES[MIN_REPEAT_ONE])
  73                 skip = _len(code); emit(0)
  74                 emit(av[0])
  75                 emit(av[1])
  76                 _compile(code, av[2], flags)
  77                 emit(OPCODES[SUCCESS])
  78                 code[skip] = _len(code) - skip
  79             else:
  80                 emit(OPCODES[REPEAT])
  81                 skip = _len(code); emit(0)
  82                 emit(av[0])
  83                 emit(av[1])
  84                 _compile(code, av[2], flags)
  85                 code[skip] = _len(code) - skip
  86                 if op is MAX_REPEAT:
  87                     emit(OPCODES[MAX_UNTIL])
  88                 else:
  89                     emit(OPCODES[MIN_UNTIL])
  90         elif op is SUBPATTERN:
  91             if av[0]:
  92                 emit(OPCODES[MARK])
  93                 emit((av[0]-1)*2)
  94             # _compile_info(code, av[1], flags)
  95             _compile(code, av[1], flags)
  96             if av[0]:
  97                 emit(OPCODES[MARK])
  98                 emit((av[0]-1)*2+1)
  99         elif op in SUCCESS_CODES:
 100             emit(OPCODES[op])
 101         elif op in ASSERT_CODES:
 102             emit(OPCODES[op])
 103             skip = _len(code); emit(0)
 104             if av[0] >= 0:
 105                 emit(0) # look ahead
 106             else:
 107                 lo, hi = av[1].getwidth()
 108                 if lo != hi:
 109                     raise error("look-behind requires fixed-width pattern")
 110                 emit(lo) # look behind
 111             _compile(code, av[1], flags)
 112             emit(OPCODES[SUCCESS])
 113             code[skip] = _len(code) - skip
 114         elif op is CALL:
 115             emit(OPCODES[op])
 116             skip = _len(code); emit(0)
 117             _compile(code, av, flags)
 118             emit(OPCODES[SUCCESS])
 119             code[skip] = _len(code) - skip
 120         elif op is AT:
 121             emit(OPCODES[op])
 122             if flags & SRE_FLAG_MULTILINE:
 123                 av = AT_MULTILINE.get(av, av)
 124             if flags & SRE_FLAG_LOCALE:
 125                 av = AT_LOCALE.get(av, av)
 126             elif flags & SRE_FLAG_UNICODE:
 127                 av = AT_UNICODE.get(av, av)
 128             emit(ATCODES[av])
 129         elif op is BRANCH:
 130             emit(OPCODES[op])
 131             tail = []
 132             tailappend = tail.append
 133             for av in av[1]:
 134                 skip = _len(code); emit(0)
 135                 # _compile_info(code, av, flags)
 136                 _compile(code, av, flags)
 137                 emit(OPCODES[JUMP])
 138                 tailappend(_len(code)); emit(0)
 139                 code[skip] = _len(code) - skip
 140             emit(0) # end of branch
 141             for tail in tail:
 142                 code[tail] = _len(code) - tail
 143         elif op is CATEGORY:
 144             emit(OPCODES[op])
 145             if flags & SRE_FLAG_LOCALE:
 146                 av = CH_LOCALE[av]
 147             elif flags & SRE_FLAG_UNICODE:
 148                 av = CH_UNICODE[av]
 149             emit(CHCODES[av])
 150         elif op is GROUPREF:
 151             if flags & SRE_FLAG_IGNORECASE:
 152                 emit(OPCODES[OP_IGNORE[op]])
 153             else:
 154                 emit(OPCODES[op])
 155             emit(av-1)
 156         elif op is GROUPREF_EXISTS:
 157             emit(OPCODES[op])
 158             emit(av[0]-1)
 159             skipyes = _len(code); emit(0)
 160             _compile(code, av[1], flags)
 161             if av[2]:
 162                 emit(OPCODES[JUMP])
 163                 skipno = _len(code); emit(0)
 164                 code[skipyes] = _len(code) - skipyes + 1
 165                 _compile(code, av[2], flags)
 166                 code[skipno] = _len(code) - skipno
 167             else:
 168                 code[skipyes] = _len(code) - skipyes + 1
 169         else:
 170             raise ValueError("unsupported operand type", op)
 171 
 172 def _compile_charset(charset, flags, code, fixup=None):
 173     # compile charset subprogram
 174     emit = code.append
 175     if fixup is None:
 176         fixup = _identityfunction
 177     for op, av in _optimize_charset(charset, fixup):
 178         emit(OPCODES[op])
 179         if op is NEGATE:
 180             pass
 181         elif op is LITERAL:
 182             emit(fixup(av))
 183         elif op is RANGE:
 184             emit(fixup(av[0]))
 185             emit(fixup(av[1]))
 186         elif op is CHARSET:
 187             code.extend(av)
 188         elif op is BIGCHARSET:
 189             code.extend(av)
 190         elif op is CATEGORY:
 191             if flags & SRE_FLAG_LOCALE:
 192                 emit(CHCODES[CH_LOCALE[av]])
 193             elif flags & SRE_FLAG_UNICODE:
 194                 emit(CHCODES[CH_UNICODE[av]])
 195             else:
 196                 emit(CHCODES[av])
 197         else:
 198             raise error("internal: unsupported set operator")
 199     emit(OPCODES[FAILURE])
 200 
 201 def _optimize_charset(charset, fixup):
 202     # internal: optimize character set
 203     out = []
 204     tail = []
 205     charmap = bytearray(256)
 206     for op, av in charset:
 207         while True:
 208             try:
 209                 if op is LITERAL:
 210                     charmap[fixup(av)] = 1
 211                 elif op is RANGE:
 212                     for i in range(fixup(av[0]), fixup(av[1])+1):
 213                         charmap[i] = 1
 214                 elif op is NEGATE:
 215                     out.append((op, av))
 216                 else:
 217                     tail.append((op, av))
 218             except IndexError:
 219                 if len(charmap) == 256:
 220                     # character set contains non-UCS1 character codes
 221                     charmap += b'\0' * 0xff00
 222                     continue
 223                 # character set contains non-BMP character codes
 224                 tail.append((op, av))
 225             break
 226 
 227     # compress character map
 228     runs = []
 229     q = 0
 230     while True:
 231         p = charmap.find(1, q)
 232         if p < 0:
 233             break
 234         if len(runs) >= 2:
 235             runs = None
 236             break
 237         q = charmap.find(0, p)
 238         if q < 0:
 239             runs.append((p, len(charmap)))
 240             break
 241         runs.append((p, q))
 242     if runs is not None:
 243         # use literal/range
 244         for p, q in runs:
 245             if q - p == 1:
 246                 out.append((LITERAL, p))
 247             else:
 248                 out.append((RANGE, (p, q - 1)))
 249         out += tail
 250         if len(out) < len(charset):
 251             return out
 252         return charset
 253 
 254     # use bitmap
 255     if len(charmap) == 256:
 256         data = _mk_bitmap(charmap)
 257         out.append((CHARSET, data))
 258         out += tail
 259         return out
 260 
 261     # To represent a big charset, first a bitmap of all characters in the
 262     # set is constructed. Then, this bitmap is sliced into chunks of 256
 263     # characters, duplicate chunks are eliminated, and each chunk is
 264     # given a number. In the compiled expression, the charset is
 265     # represented by a 32-bit word sequence, consisting of one word for
 266     # the number of different chunks, a sequence of 256 bytes (64 words)
 267     # of chunk numbers indexed by their original chunk position, and a
 268     # sequence of 256-bit chunks (8 words each).
 269 
 270     # Compression is normally good: in a typical charset, large ranges of
 271     # Unicode will be either completely excluded (e.g. if only cyrillic
 272     # letters are to be matched), or completely included (e.g. if large
 273     # subranges of Kanji match). These ranges will be represented by
 274     # chunks of all one-bits or all zero-bits.
 275 
 276     # Matching can be also done efficiently: the more significant byte of
 277     # the Unicode character is an index into the chunk number, and the
 278     # less significant byte is a bit index in the chunk (just like the
 279     # CHARSET matching).
 280 
 281     charmap = bytes(charmap) # should be hashable
 282     comps = {}
 283     mapping = bytearray(256)
 284     block = 0
 285     data = bytearray()
 286     for i in range(0, 65536, 256):
 287         chunk = charmap[i: i + 256]
 288         if chunk in comps:
 289             mapping[i // 256] = comps[chunk]
 290         else:
 291             mapping[i // 256] = comps[chunk] = block
 292             block += 1
 293             data += chunk
 294     data = _mk_bitmap(data)
 295     data[0:0] = [block] + _bytes_to_codes(mapping)
 296     out.append((BIGCHARSET, data))
 297     out += tail
 298     return out
 299 
 300 _CODEBITS = _sre.CODESIZE * 8
 301 _BITS_TRANS = b'0' + b'1' * 255
 302 def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
 303     s = bits.translate(_BITS_TRANS)[::-1]
 304     return [_int(s[i - _CODEBITS: i], 2)
 305             for i in range(len(s), 0, -_CODEBITS)]
 306 
 307 def _bytes_to_codes(b):
 308     # Convert block indices to word array
 309     import array
 310     a = array.array('I', b)
 311     assert a.itemsize == _sre.CODESIZE
 312     assert len(a) * a.itemsize == len(b)
 313     return a.tolist()
 314 
 315 def _simple(av):
 316     # check if av is a "simple" operator
 317     lo, hi = av[2].getwidth()
 318     return lo == hi == 1 and av[2][0][0] != SUBPATTERN
 319 
 320 def _generate_overlap_table(prefix):
 321     """
 322     Generate an overlap table for the following prefix.
 323     An overlap table is a table of the same size as the prefix which
 324     informs about the potential self-overlap for each index in the prefix:
 325     - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
 326     - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
 327       prefix[0:k]
 328     """
 329     table = [0] * len(prefix)
 330     for i in range(1, len(prefix)):
 331         idx = table[i - 1]
 332         while prefix[i] != prefix[idx]:
 333             if idx == 0:
 334                 table[i] = 0
 335                 break
 336             idx = table[idx - 1]
 337         else:
 338             table[i] = idx + 1
 339     return table
 340 
 341 def _compile_info(code, pattern, flags):
 342     # internal: compile an info block.  in the current version,
 343     # this contains min/max pattern width, and an optional literal
 344     # prefix or a character map
 345     lo, hi = pattern.getwidth()
 346     if lo == 0:
 347         return # not worth it
 348     # look for a literal prefix
 349     prefix = []
 350     prefixappend = prefix.append
 351     prefix_skip = 0
 352     charset = [] # not used
 353     charsetappend = charset.append
 354     if not (flags & SRE_FLAG_IGNORECASE):
 355         # look for literal prefix
 356         for op, av in pattern.data:
 357             if op is LITERAL:
 358                 if len(prefix) == prefix_skip:
 359                     prefix_skip = prefix_skip + 1
 360                 prefixappend(av)
 361             elif op is SUBPATTERN and len(av[1]) == 1:
 362                 op, av = av[1][0]
 363                 if op is LITERAL:
 364                     prefixappend(av)
 365                 else:
 366                     break
 367             else:
 368                 break
 369         # if no prefix, look for charset prefix
 370         if not prefix and pattern.data:
 371             op, av = pattern.data[0]
 372             if op is SUBPATTERN and av[1]:
 373                 op, av = av[1][0]
 374                 if op is LITERAL:
 375                     charsetappend((op, av))
 376                 elif op is BRANCH:
 377                     c = []
 378                     cappend = c.append
 379                     for p in av[1]:
 380                         if not p:
 381                             break
 382                         op, av = p[0]
 383                         if op is LITERAL:
 384                             cappend((op, av))
 385                         else:
 386                             break
 387                     else:
 388                         charset = c
 389             elif op is BRANCH:
 390                 c = []
 391                 cappend = c.append
 392                 for p in av[1]:
 393                     if not p:
 394                         break
 395                     op, av = p[0]
 396                     if op is LITERAL:
 397                         cappend((op, av))
 398                     else:
 399                         break
 400                 else:
 401                     charset = c
 402             elif op is IN:
 403                 charset = av
 404 ##     if prefix:
 405 ##         print "*** PREFIX", prefix, prefix_skip
 406 ##     if charset:
 407 ##         print "*** CHARSET", charset
 408     # add an info block
 409     emit = code.append
 410     emit(OPCODES[INFO])
 411     skip = len(code); emit(0)
 412     # literal flag
 413     mask = 0
 414     if prefix:
 415         mask = SRE_INFO_PREFIX
 416         if len(prefix) == prefix_skip == len(pattern.data):
 417             mask = mask + SRE_INFO_LITERAL
 418     elif charset:
 419         mask = mask + SRE_INFO_CHARSET
 420     emit(mask)
 421     # pattern length
 422     if lo < MAXCODE:
 423         emit(lo)
 424     else:
 425         emit(MAXCODE)
 426         prefix = prefix[:MAXCODE]
 427     if hi < MAXCODE:
 428         emit(hi)
 429     else:
 430         emit(0)
 431     # add literal prefix
 432     if prefix:
 433         emit(len(prefix)) # length
 434         emit(prefix_skip) # skip
 435         code.extend(prefix)
 436         # generate overlap table
 437         code.extend(_generate_overlap_table(prefix))
 438     elif charset:
 439         _compile_charset(charset, flags, code)
 440     code[skip] = len(code) - skip
 441 
 442 def isstring(obj):
 443     return isinstance(obj, (str, bytes))
 444 
 445 def _code(p, flags):
 446 
 447     flags = p.pattern.flags | flags
 448     code = []
 449 
 450     # compile info block
 451     _compile_info(code, p, flags)
 452 
 453     # compile the pattern
 454     _compile(code, p.data, flags)
 455 
 456     code.append(OPCODES[SUCCESS])
 457 
 458     return code
 459 
 460 def compile(p, flags=0):
 461     # internal: convert pattern list to internal format
 462 
 463     if isstring(p):
 464         pattern = p
 465         p = sre_parse.parse(p, flags)
 466     else:
 467         pattern = None
 468 
 469     code = _code(p, flags)
 470 
 471     # print code
 472 
 473     # XXX: <fl> get rid of this limitation!
 474     if p.pattern.groups > 100:
 475         raise AssertionError(
 476             "sorry, but this version only supports 100 named groups"
 477             )
 478 
 479     # map in either direction
 480     groupindex = p.pattern.groupdict
 481     indexgroup = [None] * p.pattern.groups
 482     for k, i in groupindex.items():
 483         indexgroup[i] = k
 484 
 485     return _sre.compile(
 486         pattern, flags | p.pattern.flags, code,
 487         p.pattern.groups-1,
 488         groupindex, indexgroup
 489         )