Source file src/regexp/syntax/parse.go
1
2
3
4
5 package syntax
6
7 import (
8 "sort"
9 "strings"
10 "unicode"
11 "unicode/utf8"
12 )
13
14
15
16 type Error struct {
17 Code ErrorCode
18 Expr string
19 }
20
21 func (e *Error) Error() string {
22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
23 }
24
25
26 type ErrorCode string
27
28 const (
29
30 ErrInternalError ErrorCode = "regexp/syntax: internal error"
31
32
33 ErrInvalidCharClass ErrorCode = "invalid character class"
34 ErrInvalidCharRange ErrorCode = "invalid character class range"
35 ErrInvalidEscape ErrorCode = "invalid escape sequence"
36 ErrInvalidNamedCapture ErrorCode = "invalid named capture"
37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
41 ErrMissingBracket ErrorCode = "missing closing ]"
42 ErrMissingParen ErrorCode = "missing closing )"
43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
45 ErrUnexpectedParen ErrorCode = "unexpected )"
46 )
47
48 func (e ErrorCode) String() string {
49 return string(e)
50 }
51
52
53 type Flags uint16
54
55 const (
56 FoldCase Flags = 1 << iota
57 Literal
58 ClassNL
59 DotNL
60 OneLine
61 NonGreedy
62 PerlX
63 UnicodeGroups
64 WasDollar
65 Simple
66
67 MatchNL = ClassNL | DotNL
68
69 Perl = ClassNL | OneLine | PerlX | UnicodeGroups
70 POSIX Flags = 0
71 )
72
73
74 const (
75 opLeftParen = opPseudo + iota
76 opVerticalBar
77 )
78
79 type parser struct {
80 flags Flags
81 stack []*Regexp
82 free *Regexp
83 numCap int
84 wholeRegexp string
85 tmpClass []rune
86 }
87
88 func (p *parser) newRegexp(op Op) *Regexp {
89 re := p.free
90 if re != nil {
91 p.free = re.Sub0[0]
92 *re = Regexp{}
93 } else {
94 re = new(Regexp)
95 }
96 re.Op = op
97 return re
98 }
99
100 func (p *parser) reuse(re *Regexp) {
101 re.Sub0[0] = p.free
102 p.free = re
103 }
104
105
106
107
108 func (p *parser) push(re *Regexp) *Regexp {
109 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
110
111 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
112 return nil
113 }
114 re.Op = OpLiteral
115 re.Rune = re.Rune[:1]
116 re.Flags = p.flags &^ FoldCase
117 } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
118 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
119 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
120 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
121 re.Op == OpCharClass && len(re.Rune) == 2 &&
122 re.Rune[0]+1 == re.Rune[1] &&
123 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
124 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
125
126 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
127 return nil
128 }
129
130
131 re.Op = OpLiteral
132 re.Rune = re.Rune[:1]
133 re.Flags = p.flags | FoldCase
134 } else {
135
136 p.maybeConcat(-1, 0)
137 }
138
139 p.stack = append(p.stack, re)
140 return re
141 }
142
143
144
145
146
147
148
149
150
151
152 func (p *parser) maybeConcat(r rune, flags Flags) bool {
153 n := len(p.stack)
154 if n < 2 {
155 return false
156 }
157
158 re1 := p.stack[n-1]
159 re2 := p.stack[n-2]
160 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
161 return false
162 }
163
164
165 re2.Rune = append(re2.Rune, re1.Rune...)
166
167
168 if r >= 0 {
169 re1.Rune = re1.Rune0[:1]
170 re1.Rune[0] = r
171 re1.Flags = flags
172 return true
173 }
174
175 p.stack = p.stack[:n-1]
176 p.reuse(re1)
177 return false
178 }
179
180
181 func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
182 re := p.newRegexp(OpLiteral)
183 re.Flags = flags
184 if flags&FoldCase != 0 {
185 r = minFoldRune(r)
186 }
187 re.Rune0[0] = r
188 re.Rune = re.Rune0[:1]
189 return re
190 }
191
192
193 func minFoldRune(r rune) rune {
194 if r < minFold || r > maxFold {
195 return r
196 }
197 min := r
198 r0 := r
199 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
200 if min > r {
201 min = r
202 }
203 }
204 return min
205 }
206
207
208
209 func (p *parser) literal(r rune) {
210 p.push(p.newLiteral(r, p.flags))
211 }
212
213
214
215 func (p *parser) op(op Op) *Regexp {
216 re := p.newRegexp(op)
217 re.Flags = p.flags
218 return p.push(re)
219 }
220
221
222
223
224
225 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
226 flags := p.flags
227 if p.flags&PerlX != 0 {
228 if len(after) > 0 && after[0] == '?' {
229 after = after[1:]
230 flags ^= NonGreedy
231 }
232 if lastRepeat != "" {
233
234
235
236 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
237 }
238 }
239 n := len(p.stack)
240 if n == 0 {
241 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
242 }
243 sub := p.stack[n-1]
244 if sub.Op >= opPseudo {
245 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
246 }
247
248 re := p.newRegexp(op)
249 re.Min = min
250 re.Max = max
251 re.Flags = flags
252 re.Sub = re.Sub0[:1]
253 re.Sub[0] = sub
254 p.stack[n-1] = re
255
256 if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) {
257 return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
258 }
259
260 return after, nil
261 }
262
263
264
265
266
267
268
269
270
271
272 func repeatIsValid(re *Regexp, n int) bool {
273 if re.Op == OpRepeat {
274 m := re.Max
275 if m == 0 {
276 return true
277 }
278 if m < 0 {
279 m = re.Min
280 }
281 if m > n {
282 return false
283 }
284 if m > 0 {
285 n /= m
286 }
287 }
288 for _, sub := range re.Sub {
289 if !repeatIsValid(sub, n) {
290 return false
291 }
292 }
293 return true
294 }
295
296
297 func (p *parser) concat() *Regexp {
298 p.maybeConcat(-1, 0)
299
300
301 i := len(p.stack)
302 for i > 0 && p.stack[i-1].Op < opPseudo {
303 i--
304 }
305 subs := p.stack[i:]
306 p.stack = p.stack[:i]
307
308
309 if len(subs) == 0 {
310 return p.push(p.newRegexp(OpEmptyMatch))
311 }
312
313 return p.push(p.collapse(subs, OpConcat))
314 }
315
316
317 func (p *parser) alternate() *Regexp {
318
319
320 i := len(p.stack)
321 for i > 0 && p.stack[i-1].Op < opPseudo {
322 i--
323 }
324 subs := p.stack[i:]
325 p.stack = p.stack[:i]
326
327
328
329 if len(subs) > 0 {
330 cleanAlt(subs[len(subs)-1])
331 }
332
333
334
335 if len(subs) == 0 {
336 return p.push(p.newRegexp(OpNoMatch))
337 }
338
339 return p.push(p.collapse(subs, OpAlternate))
340 }
341
342
343 func cleanAlt(re *Regexp) {
344 switch re.Op {
345 case OpCharClass:
346 re.Rune = cleanClass(&re.Rune)
347 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
348 re.Rune = nil
349 re.Op = OpAnyChar
350 return
351 }
352 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
353 re.Rune = nil
354 re.Op = OpAnyCharNotNL
355 return
356 }
357 if cap(re.Rune)-len(re.Rune) > 100 {
358
359
360 re.Rune = append(re.Rune0[:0], re.Rune...)
361 }
362 }
363 }
364
365
366
367
368
369 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
370 if len(subs) == 1 {
371 return subs[0]
372 }
373 re := p.newRegexp(op)
374 re.Sub = re.Sub0[:0]
375 for _, sub := range subs {
376 if sub.Op == op {
377 re.Sub = append(re.Sub, sub.Sub...)
378 p.reuse(sub)
379 } else {
380 re.Sub = append(re.Sub, sub)
381 }
382 }
383 if op == OpAlternate {
384 re.Sub = p.factor(re.Sub)
385 if len(re.Sub) == 1 {
386 old := re
387 re = re.Sub[0]
388 p.reuse(old)
389 }
390 }
391 return re
392 }
393
394
395
396
397
398
399
400
401
402
403
404
405 func (p *parser) factor(sub []*Regexp) []*Regexp {
406 if len(sub) < 2 {
407 return sub
408 }
409
410
411 var str []rune
412 var strflags Flags
413 start := 0
414 out := sub[:0]
415 for i := 0; i <= len(sub); i++ {
416
417
418
419
420
421
422 var istr []rune
423 var iflags Flags
424 if i < len(sub) {
425 istr, iflags = p.leadingString(sub[i])
426 if iflags == strflags {
427 same := 0
428 for same < len(str) && same < len(istr) && str[same] == istr[same] {
429 same++
430 }
431 if same > 0 {
432
433
434 str = str[:same]
435 continue
436 }
437 }
438 }
439
440
441
442
443
444
445 if i == start {
446
447 } else if i == start+1 {
448
449 out = append(out, sub[start])
450 } else {
451
452 prefix := p.newRegexp(OpLiteral)
453 prefix.Flags = strflags
454 prefix.Rune = append(prefix.Rune[:0], str...)
455
456 for j := start; j < i; j++ {
457 sub[j] = p.removeLeadingString(sub[j], len(str))
458 }
459 suffix := p.collapse(sub[start:i], OpAlternate)
460
461 re := p.newRegexp(OpConcat)
462 re.Sub = append(re.Sub[:0], prefix, suffix)
463 out = append(out, re)
464 }
465
466
467 start = i
468 str = istr
469 strflags = iflags
470 }
471 sub = out
472
473
474
475
476
477
478
479
480
481 start = 0
482 out = sub[:0]
483 var first *Regexp
484 for i := 0; i <= len(sub); i++ {
485
486
487
488
489
490 var ifirst *Regexp
491 if i < len(sub) {
492 ifirst = p.leadingRegexp(sub[i])
493 if first != nil && first.Equal(ifirst) &&
494
495 (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
496 continue
497 }
498 }
499
500
501
502
503
504 if i == start {
505
506 } else if i == start+1 {
507
508 out = append(out, sub[start])
509 } else {
510
511 prefix := first
512 for j := start; j < i; j++ {
513 reuse := j != start
514 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
515 }
516 suffix := p.collapse(sub[start:i], OpAlternate)
517
518 re := p.newRegexp(OpConcat)
519 re.Sub = append(re.Sub[:0], prefix, suffix)
520 out = append(out, re)
521 }
522
523
524 start = i
525 first = ifirst
526 }
527 sub = out
528
529
530 start = 0
531 out = sub[:0]
532 for i := 0; i <= len(sub); i++ {
533
534
535
536
537
538
539 if i < len(sub) && isCharClass(sub[i]) {
540 continue
541 }
542
543
544
545 if i == start {
546
547 } else if i == start+1 {
548 out = append(out, sub[start])
549 } else {
550
551
552 max := start
553 for j := start + 1; j < i; j++ {
554 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
555 max = j
556 }
557 }
558 sub[start], sub[max] = sub[max], sub[start]
559
560 for j := start + 1; j < i; j++ {
561 mergeCharClass(sub[start], sub[j])
562 p.reuse(sub[j])
563 }
564 cleanAlt(sub[start])
565 out = append(out, sub[start])
566 }
567
568
569 if i < len(sub) {
570 out = append(out, sub[i])
571 }
572 start = i + 1
573 }
574 sub = out
575
576
577 start = 0
578 out = sub[:0]
579 for i := range sub {
580 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
581 continue
582 }
583 out = append(out, sub[i])
584 }
585 sub = out
586
587 return sub
588 }
589
590
591
592 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
593 if re.Op == OpConcat && len(re.Sub) > 0 {
594 re = re.Sub[0]
595 }
596 if re.Op != OpLiteral {
597 return nil, 0
598 }
599 return re.Rune, re.Flags & FoldCase
600 }
601
602
603
604 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
605 if re.Op == OpConcat && len(re.Sub) > 0 {
606
607
608 sub := re.Sub[0]
609 sub = p.removeLeadingString(sub, n)
610 re.Sub[0] = sub
611 if sub.Op == OpEmptyMatch {
612 p.reuse(sub)
613 switch len(re.Sub) {
614 case 0, 1:
615
616 re.Op = OpEmptyMatch
617 re.Sub = nil
618 case 2:
619 old := re
620 re = re.Sub[1]
621 p.reuse(old)
622 default:
623 copy(re.Sub, re.Sub[1:])
624 re.Sub = re.Sub[:len(re.Sub)-1]
625 }
626 }
627 return re
628 }
629
630 if re.Op == OpLiteral {
631 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
632 if len(re.Rune) == 0 {
633 re.Op = OpEmptyMatch
634 }
635 }
636 return re
637 }
638
639
640
641 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
642 if re.Op == OpEmptyMatch {
643 return nil
644 }
645 if re.Op == OpConcat && len(re.Sub) > 0 {
646 sub := re.Sub[0]
647 if sub.Op == OpEmptyMatch {
648 return nil
649 }
650 return sub
651 }
652 return re
653 }
654
655
656
657
658 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
659 if re.Op == OpConcat && len(re.Sub) > 0 {
660 if reuse {
661 p.reuse(re.Sub[0])
662 }
663 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
664 switch len(re.Sub) {
665 case 0:
666 re.Op = OpEmptyMatch
667 re.Sub = nil
668 case 1:
669 old := re
670 re = re.Sub[0]
671 p.reuse(old)
672 }
673 return re
674 }
675 if reuse {
676 p.reuse(re)
677 }
678 return p.newRegexp(OpEmptyMatch)
679 }
680
681 func literalRegexp(s string, flags Flags) *Regexp {
682 re := &Regexp{Op: OpLiteral}
683 re.Flags = flags
684 re.Rune = re.Rune0[:0]
685 for _, c := range s {
686 if len(re.Rune) >= cap(re.Rune) {
687
688 re.Rune = []rune(s)
689 break
690 }
691 re.Rune = append(re.Rune, c)
692 }
693 return re
694 }
695
696
697
698
699
700
701 func Parse(s string, flags Flags) (*Regexp, error) {
702 if flags&Literal != 0 {
703
704 if err := checkUTF8(s); err != nil {
705 return nil, err
706 }
707 return literalRegexp(s, flags), nil
708 }
709
710
711 var (
712 p parser
713 err error
714 c rune
715 op Op
716 lastRepeat string
717 )
718 p.flags = flags
719 p.wholeRegexp = s
720 t := s
721 for t != "" {
722 repeat := ""
723 BigSwitch:
724 switch t[0] {
725 default:
726 if c, t, err = nextRune(t); err != nil {
727 return nil, err
728 }
729 p.literal(c)
730
731 case '(':
732 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
733
734 if t, err = p.parsePerlFlags(t); err != nil {
735 return nil, err
736 }
737 break
738 }
739 p.numCap++
740 p.op(opLeftParen).Cap = p.numCap
741 t = t[1:]
742 case '|':
743 if err = p.parseVerticalBar(); err != nil {
744 return nil, err
745 }
746 t = t[1:]
747 case ')':
748 if err = p.parseRightParen(); err != nil {
749 return nil, err
750 }
751 t = t[1:]
752 case '^':
753 if p.flags&OneLine != 0 {
754 p.op(OpBeginText)
755 } else {
756 p.op(OpBeginLine)
757 }
758 t = t[1:]
759 case '$':
760 if p.flags&OneLine != 0 {
761 p.op(OpEndText).Flags |= WasDollar
762 } else {
763 p.op(OpEndLine)
764 }
765 t = t[1:]
766 case '.':
767 if p.flags&DotNL != 0 {
768 p.op(OpAnyChar)
769 } else {
770 p.op(OpAnyCharNotNL)
771 }
772 t = t[1:]
773 case '[':
774 if t, err = p.parseClass(t); err != nil {
775 return nil, err
776 }
777 case '*', '+', '?':
778 before := t
779 switch t[0] {
780 case '*':
781 op = OpStar
782 case '+':
783 op = OpPlus
784 case '?':
785 op = OpQuest
786 }
787 after := t[1:]
788 if after, err = p.repeat(op, 0, 0, before, after, lastRepeat); err != nil {
789 return nil, err
790 }
791 repeat = before
792 t = after
793 case '{':
794 op = OpRepeat
795 before := t
796 min, max, after, ok := p.parseRepeat(t)
797 if !ok {
798
799 p.literal('{')
800 t = t[1:]
801 break
802 }
803 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
804
805 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
806 }
807 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
808 return nil, err
809 }
810 repeat = before
811 t = after
812 case '\\':
813 if p.flags&PerlX != 0 && len(t) >= 2 {
814 switch t[1] {
815 case 'A':
816 p.op(OpBeginText)
817 t = t[2:]
818 break BigSwitch
819 case 'b':
820 p.op(OpWordBoundary)
821 t = t[2:]
822 break BigSwitch
823 case 'B':
824 p.op(OpNoWordBoundary)
825 t = t[2:]
826 break BigSwitch
827 case 'C':
828
829 return nil, &Error{ErrInvalidEscape, t[:2]}
830 case 'Q':
831
832 var lit string
833 if i := strings.Index(t, `\E`); i < 0 {
834 lit = t[2:]
835 t = ""
836 } else {
837 lit = t[2:i]
838 t = t[i+2:]
839 }
840 for lit != "" {
841 c, rest, err := nextRune(lit)
842 if err != nil {
843 return nil, err
844 }
845 p.literal(c)
846 lit = rest
847 }
848 break BigSwitch
849 case 'z':
850 p.op(OpEndText)
851 t = t[2:]
852 break BigSwitch
853 }
854 }
855
856 re := p.newRegexp(OpCharClass)
857 re.Flags = p.flags
858
859
860 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
861 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
862 if err != nil {
863 return nil, err
864 }
865 if r != nil {
866 re.Rune = r
867 t = rest
868 p.push(re)
869 break BigSwitch
870 }
871 }
872
873
874 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
875 re.Rune = r
876 t = rest
877 p.push(re)
878 break BigSwitch
879 }
880 p.reuse(re)
881
882
883 if c, t, err = p.parseEscape(t); err != nil {
884 return nil, err
885 }
886 p.literal(c)
887 }
888 lastRepeat = repeat
889 }
890
891 p.concat()
892 if p.swapVerticalBar() {
893
894 p.stack = p.stack[:len(p.stack)-1]
895 }
896 p.alternate()
897
898 n := len(p.stack)
899 if n != 1 {
900 return nil, &Error{ErrMissingParen, s}
901 }
902 return p.stack[0], nil
903 }
904
905
906
907
908 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
909 if s == "" || s[0] != '{' {
910 return
911 }
912 s = s[1:]
913 var ok1 bool
914 if min, s, ok1 = p.parseInt(s); !ok1 {
915 return
916 }
917 if s == "" {
918 return
919 }
920 if s[0] != ',' {
921 max = min
922 } else {
923 s = s[1:]
924 if s == "" {
925 return
926 }
927 if s[0] == '}' {
928 max = -1
929 } else if max, s, ok1 = p.parseInt(s); !ok1 {
930 return
931 } else if max < 0 {
932
933 min = -1
934 }
935 }
936 if s == "" || s[0] != '}' {
937 return
938 }
939 rest = s[1:]
940 ok = true
941 return
942 }
943
944
945
946
947 func (p *parser) parsePerlFlags(s string) (rest string, err error) {
948 t := s
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965 if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
966
967 end := strings.IndexRune(t, '>')
968 if end < 0 {
969 if err = checkUTF8(t); err != nil {
970 return "", err
971 }
972 return "", &Error{ErrInvalidNamedCapture, s}
973 }
974
975 capture := t[:end+1]
976 name := t[4:end]
977 if err = checkUTF8(name); err != nil {
978 return "", err
979 }
980 if !isValidCaptureName(name) {
981 return "", &Error{ErrInvalidNamedCapture, capture}
982 }
983
984
985 p.numCap++
986 re := p.op(opLeftParen)
987 re.Cap = p.numCap
988 re.Name = name
989 return t[end+1:], nil
990 }
991
992
993 var c rune
994 t = t[2:]
995 flags := p.flags
996 sign := +1
997 sawFlag := false
998 Loop:
999 for t != "" {
1000 if c, t, err = nextRune(t); err != nil {
1001 return "", err
1002 }
1003 switch c {
1004 default:
1005 break Loop
1006
1007
1008 case 'i':
1009 flags |= FoldCase
1010 sawFlag = true
1011 case 'm':
1012 flags &^= OneLine
1013 sawFlag = true
1014 case 's':
1015 flags |= DotNL
1016 sawFlag = true
1017 case 'U':
1018 flags |= NonGreedy
1019 sawFlag = true
1020
1021
1022 case '-':
1023 if sign < 0 {
1024 break Loop
1025 }
1026 sign = -1
1027
1028
1029 flags = ^flags
1030 sawFlag = false
1031
1032
1033 case ':', ')':
1034 if sign < 0 {
1035 if !sawFlag {
1036 break Loop
1037 }
1038 flags = ^flags
1039 }
1040 if c == ':' {
1041
1042 p.op(opLeftParen)
1043 }
1044 p.flags = flags
1045 return t, nil
1046 }
1047 }
1048
1049 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
1050 }
1051
1052
1053
1054
1055
1056
1057 func isValidCaptureName(name string) bool {
1058 if name == "" {
1059 return false
1060 }
1061 for _, c := range name {
1062 if c != '_' && !isalnum(c) {
1063 return false
1064 }
1065 }
1066 return true
1067 }
1068
1069
1070 func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1071 if s == "" || s[0] < '0' || '9' < s[0] {
1072 return
1073 }
1074
1075 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1076 return
1077 }
1078 t := s
1079 for s != "" && '0' <= s[0] && s[0] <= '9' {
1080 s = s[1:]
1081 }
1082 rest = s
1083 ok = true
1084
1085 t = t[:len(t)-len(s)]
1086 for i := 0; i < len(t); i++ {
1087
1088 if n >= 1e8 {
1089 n = -1
1090 break
1091 }
1092 n = n*10 + int(t[i]) - '0'
1093 }
1094 return
1095 }
1096
1097
1098
1099 func isCharClass(re *Regexp) bool {
1100 return re.Op == OpLiteral && len(re.Rune) == 1 ||
1101 re.Op == OpCharClass ||
1102 re.Op == OpAnyCharNotNL ||
1103 re.Op == OpAnyChar
1104 }
1105
1106
1107 func matchRune(re *Regexp, r rune) bool {
1108 switch re.Op {
1109 case OpLiteral:
1110 return len(re.Rune) == 1 && re.Rune[0] == r
1111 case OpCharClass:
1112 for i := 0; i < len(re.Rune); i += 2 {
1113 if re.Rune[i] <= r && r <= re.Rune[i+1] {
1114 return true
1115 }
1116 }
1117 return false
1118 case OpAnyCharNotNL:
1119 return r != '\n'
1120 case OpAnyChar:
1121 return true
1122 }
1123 return false
1124 }
1125
1126
1127 func (p *parser) parseVerticalBar() error {
1128 p.concat()
1129
1130
1131
1132
1133
1134 if !p.swapVerticalBar() {
1135 p.op(opVerticalBar)
1136 }
1137
1138 return nil
1139 }
1140
1141
1142
1143
1144 func mergeCharClass(dst, src *Regexp) {
1145 switch dst.Op {
1146 case OpAnyChar:
1147
1148 case OpAnyCharNotNL:
1149
1150 if matchRune(src, '\n') {
1151 dst.Op = OpAnyChar
1152 }
1153 case OpCharClass:
1154
1155 if src.Op == OpLiteral {
1156 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1157 } else {
1158 dst.Rune = appendClass(dst.Rune, src.Rune)
1159 }
1160 case OpLiteral:
1161
1162 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1163 break
1164 }
1165 dst.Op = OpCharClass
1166 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1167 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1168 }
1169 }
1170
1171
1172
1173
1174 func (p *parser) swapVerticalBar() bool {
1175
1176
1177 n := len(p.stack)
1178 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1179 re1 := p.stack[n-1]
1180 re3 := p.stack[n-3]
1181
1182 if re1.Op > re3.Op {
1183 re1, re3 = re3, re1
1184 p.stack[n-3] = re3
1185 }
1186 mergeCharClass(re3, re1)
1187 p.reuse(re1)
1188 p.stack = p.stack[:n-1]
1189 return true
1190 }
1191
1192 if n >= 2 {
1193 re1 := p.stack[n-1]
1194 re2 := p.stack[n-2]
1195 if re2.Op == opVerticalBar {
1196 if n >= 3 {
1197
1198
1199 cleanAlt(p.stack[n-3])
1200 }
1201 p.stack[n-2] = re1
1202 p.stack[n-1] = re2
1203 return true
1204 }
1205 }
1206 return false
1207 }
1208
1209
1210 func (p *parser) parseRightParen() error {
1211 p.concat()
1212 if p.swapVerticalBar() {
1213
1214 p.stack = p.stack[:len(p.stack)-1]
1215 }
1216 p.alternate()
1217
1218 n := len(p.stack)
1219 if n < 2 {
1220 return &Error{ErrUnexpectedParen, p.wholeRegexp}
1221 }
1222 re1 := p.stack[n-1]
1223 re2 := p.stack[n-2]
1224 p.stack = p.stack[:n-2]
1225 if re2.Op != opLeftParen {
1226 return &Error{ErrUnexpectedParen, p.wholeRegexp}
1227 }
1228
1229 p.flags = re2.Flags
1230 if re2.Cap == 0 {
1231
1232 p.push(re1)
1233 } else {
1234 re2.Op = OpCapture
1235 re2.Sub = re2.Sub0[:1]
1236 re2.Sub[0] = re1
1237 p.push(re2)
1238 }
1239 return nil
1240 }
1241
1242
1243
1244 func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1245 t := s[1:]
1246 if t == "" {
1247 return 0, "", &Error{ErrTrailingBackslash, ""}
1248 }
1249 c, t, err := nextRune(t)
1250 if err != nil {
1251 return 0, "", err
1252 }
1253
1254 Switch:
1255 switch c {
1256 default:
1257 if c < utf8.RuneSelf && !isalnum(c) {
1258
1259
1260
1261
1262 return c, t, nil
1263 }
1264
1265
1266 case '1', '2', '3', '4', '5', '6', '7':
1267
1268 if t == "" || t[0] < '0' || t[0] > '7' {
1269 break
1270 }
1271 fallthrough
1272 case '0':
1273
1274 r = c - '0'
1275 for i := 1; i < 3; i++ {
1276 if t == "" || t[0] < '0' || t[0] > '7' {
1277 break
1278 }
1279 r = r*8 + rune(t[0]) - '0'
1280 t = t[1:]
1281 }
1282 return r, t, nil
1283
1284
1285 case 'x':
1286 if t == "" {
1287 break
1288 }
1289 if c, t, err = nextRune(t); err != nil {
1290 return 0, "", err
1291 }
1292 if c == '{' {
1293
1294
1295
1296
1297 nhex := 0
1298 r = 0
1299 for {
1300 if t == "" {
1301 break Switch
1302 }
1303 if c, t, err = nextRune(t); err != nil {
1304 return 0, "", err
1305 }
1306 if c == '}' {
1307 break
1308 }
1309 v := unhex(c)
1310 if v < 0 {
1311 break Switch
1312 }
1313 r = r*16 + v
1314 if r > unicode.MaxRune {
1315 break Switch
1316 }
1317 nhex++
1318 }
1319 if nhex == 0 {
1320 break Switch
1321 }
1322 return r, t, nil
1323 }
1324
1325
1326 x := unhex(c)
1327 if c, t, err = nextRune(t); err != nil {
1328 return 0, "", err
1329 }
1330 y := unhex(c)
1331 if x < 0 || y < 0 {
1332 break
1333 }
1334 return x*16 + y, t, nil
1335
1336
1337
1338
1339
1340
1341
1342 case 'a':
1343 return '\a', t, err
1344 case 'f':
1345 return '\f', t, err
1346 case 'n':
1347 return '\n', t, err
1348 case 'r':
1349 return '\r', t, err
1350 case 't':
1351 return '\t', t, err
1352 case 'v':
1353 return '\v', t, err
1354 }
1355 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1356 }
1357
1358
1359
1360 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1361 if s == "" {
1362 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1363 }
1364
1365
1366
1367 if s[0] == '\\' {
1368 return p.parseEscape(s)
1369 }
1370
1371 return nextRune(s)
1372 }
1373
1374 type charGroup struct {
1375 sign int
1376 class []rune
1377 }
1378
1379
1380
1381
1382 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1383 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1384 return
1385 }
1386 g := perlGroup[s[0:2]]
1387 if g.sign == 0 {
1388 return
1389 }
1390 return p.appendGroup(r, g), s[2:]
1391 }
1392
1393
1394
1395
1396 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1397 if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1398 return
1399 }
1400
1401 i := strings.Index(s[2:], ":]")
1402 if i < 0 {
1403 return
1404 }
1405 i += 2
1406 name, s := s[0:i+2], s[i+2:]
1407 g := posixGroup[name]
1408 if g.sign == 0 {
1409 return nil, "", &Error{ErrInvalidCharRange, name}
1410 }
1411 return p.appendGroup(r, g), s, nil
1412 }
1413
1414 func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1415 if p.flags&FoldCase == 0 {
1416 if g.sign < 0 {
1417 r = appendNegatedClass(r, g.class)
1418 } else {
1419 r = appendClass(r, g.class)
1420 }
1421 } else {
1422 tmp := p.tmpClass[:0]
1423 tmp = appendFoldedClass(tmp, g.class)
1424 p.tmpClass = tmp
1425 tmp = cleanClass(&p.tmpClass)
1426 if g.sign < 0 {
1427 r = appendNegatedClass(r, tmp)
1428 } else {
1429 r = appendClass(r, tmp)
1430 }
1431 }
1432 return r
1433 }
1434
1435 var anyTable = &unicode.RangeTable{
1436 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
1437 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1438 }
1439
1440
1441
1442 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1443
1444 if name == "Any" {
1445 return anyTable, anyTable
1446 }
1447 if t := unicode.Categories[name]; t != nil {
1448 return t, unicode.FoldCategory[name]
1449 }
1450 if t := unicode.Scripts[name]; t != nil {
1451 return t, unicode.FoldScript[name]
1452 }
1453 return nil, nil
1454 }
1455
1456
1457
1458
1459 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1460 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1461 return
1462 }
1463
1464
1465 sign := +1
1466 if s[1] == 'P' {
1467 sign = -1
1468 }
1469 t := s[2:]
1470 c, t, err := nextRune(t)
1471 if err != nil {
1472 return
1473 }
1474 var seq, name string
1475 if c != '{' {
1476
1477 seq = s[:len(s)-len(t)]
1478 name = seq[2:]
1479 } else {
1480
1481 end := strings.IndexRune(s, '}')
1482 if end < 0 {
1483 if err = checkUTF8(s); err != nil {
1484 return
1485 }
1486 return nil, "", &Error{ErrInvalidCharRange, s}
1487 }
1488 seq, t = s[:end+1], s[end+1:]
1489 name = s[3:end]
1490 if err = checkUTF8(name); err != nil {
1491 return
1492 }
1493 }
1494
1495
1496 if name != "" && name[0] == '^' {
1497 sign = -sign
1498 name = name[1:]
1499 }
1500
1501 tab, fold := unicodeTable(name)
1502 if tab == nil {
1503 return nil, "", &Error{ErrInvalidCharRange, seq}
1504 }
1505
1506 if p.flags&FoldCase == 0 || fold == nil {
1507 if sign > 0 {
1508 r = appendTable(r, tab)
1509 } else {
1510 r = appendNegatedTable(r, tab)
1511 }
1512 } else {
1513
1514
1515
1516 tmp := p.tmpClass[:0]
1517 tmp = appendTable(tmp, tab)
1518 tmp = appendTable(tmp, fold)
1519 p.tmpClass = tmp
1520 tmp = cleanClass(&p.tmpClass)
1521 if sign > 0 {
1522 r = appendClass(r, tmp)
1523 } else {
1524 r = appendNegatedClass(r, tmp)
1525 }
1526 }
1527 return r, t, nil
1528 }
1529
1530
1531
1532 func (p *parser) parseClass(s string) (rest string, err error) {
1533 t := s[1:]
1534 re := p.newRegexp(OpCharClass)
1535 re.Flags = p.flags
1536 re.Rune = re.Rune0[:0]
1537
1538 sign := +1
1539 if t != "" && t[0] == '^' {
1540 sign = -1
1541 t = t[1:]
1542
1543
1544
1545 if p.flags&ClassNL == 0 {
1546 re.Rune = append(re.Rune, '\n', '\n')
1547 }
1548 }
1549
1550 class := re.Rune
1551 first := true
1552 for t == "" || t[0] != ']' || first {
1553
1554
1555 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1556 _, size := utf8.DecodeRuneInString(t[1:])
1557 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1558 }
1559 first = false
1560
1561
1562 if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1563 nclass, nt, err := p.parseNamedClass(t, class)
1564 if err != nil {
1565 return "", err
1566 }
1567 if nclass != nil {
1568 class, t = nclass, nt
1569 continue
1570 }
1571 }
1572
1573
1574 nclass, nt, err := p.parseUnicodeClass(t, class)
1575 if err != nil {
1576 return "", err
1577 }
1578 if nclass != nil {
1579 class, t = nclass, nt
1580 continue
1581 }
1582
1583
1584 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1585 class, t = nclass, nt
1586 continue
1587 }
1588
1589
1590 rng := t
1591 var lo, hi rune
1592 if lo, t, err = p.parseClassChar(t, s); err != nil {
1593 return "", err
1594 }
1595 hi = lo
1596
1597 if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1598 t = t[1:]
1599 if hi, t, err = p.parseClassChar(t, s); err != nil {
1600 return "", err
1601 }
1602 if hi < lo {
1603 rng = rng[:len(rng)-len(t)]
1604 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1605 }
1606 }
1607 if p.flags&FoldCase == 0 {
1608 class = appendRange(class, lo, hi)
1609 } else {
1610 class = appendFoldedRange(class, lo, hi)
1611 }
1612 }
1613 t = t[1:]
1614
1615
1616 re.Rune = class
1617 class = cleanClass(&re.Rune)
1618 if sign < 0 {
1619 class = negateClass(class)
1620 }
1621 re.Rune = class
1622 p.push(re)
1623 return t, nil
1624 }
1625
1626
1627
1628 func cleanClass(rp *[]rune) []rune {
1629
1630
1631 sort.Sort(ranges{rp})
1632
1633 r := *rp
1634 if len(r) < 2 {
1635 return r
1636 }
1637
1638
1639 w := 2
1640 for i := 2; i < len(r); i += 2 {
1641 lo, hi := r[i], r[i+1]
1642 if lo <= r[w-1]+1 {
1643
1644 if hi > r[w-1] {
1645 r[w-1] = hi
1646 }
1647 continue
1648 }
1649
1650 r[w] = lo
1651 r[w+1] = hi
1652 w += 2
1653 }
1654
1655 return r[:w]
1656 }
1657
1658
1659 func appendLiteral(r []rune, x rune, flags Flags) []rune {
1660 if flags&FoldCase != 0 {
1661 return appendFoldedRange(r, x, x)
1662 }
1663 return appendRange(r, x, x)
1664 }
1665
1666
1667 func appendRange(r []rune, lo, hi rune) []rune {
1668
1669
1670
1671
1672 n := len(r)
1673 for i := 2; i <= 4; i += 2 {
1674 if n >= i {
1675 rlo, rhi := r[n-i], r[n-i+1]
1676 if lo <= rhi+1 && rlo <= hi+1 {
1677 if lo < rlo {
1678 r[n-i] = lo
1679 }
1680 if hi > rhi {
1681 r[n-i+1] = hi
1682 }
1683 return r
1684 }
1685 }
1686 }
1687
1688 return append(r, lo, hi)
1689 }
1690
1691 const (
1692
1693
1694 minFold = 0x0041
1695 maxFold = 0x1e943
1696 )
1697
1698
1699
1700 func appendFoldedRange(r []rune, lo, hi rune) []rune {
1701
1702 if lo <= minFold && hi >= maxFold {
1703
1704 return appendRange(r, lo, hi)
1705 }
1706 if hi < minFold || lo > maxFold {
1707
1708 return appendRange(r, lo, hi)
1709 }
1710 if lo < minFold {
1711
1712 r = appendRange(r, lo, minFold-1)
1713 lo = minFold
1714 }
1715 if hi > maxFold {
1716
1717 r = appendRange(r, maxFold+1, hi)
1718 hi = maxFold
1719 }
1720
1721
1722 for c := lo; c <= hi; c++ {
1723 r = appendRange(r, c, c)
1724 f := unicode.SimpleFold(c)
1725 for f != c {
1726 r = appendRange(r, f, f)
1727 f = unicode.SimpleFold(f)
1728 }
1729 }
1730 return r
1731 }
1732
1733
1734
1735 func appendClass(r []rune, x []rune) []rune {
1736 for i := 0; i < len(x); i += 2 {
1737 r = appendRange(r, x[i], x[i+1])
1738 }
1739 return r
1740 }
1741
1742
1743 func appendFoldedClass(r []rune, x []rune) []rune {
1744 for i := 0; i < len(x); i += 2 {
1745 r = appendFoldedRange(r, x[i], x[i+1])
1746 }
1747 return r
1748 }
1749
1750
1751
1752 func appendNegatedClass(r []rune, x []rune) []rune {
1753 nextLo := '\u0000'
1754 for i := 0; i < len(x); i += 2 {
1755 lo, hi := x[i], x[i+1]
1756 if nextLo <= lo-1 {
1757 r = appendRange(r, nextLo, lo-1)
1758 }
1759 nextLo = hi + 1
1760 }
1761 if nextLo <= unicode.MaxRune {
1762 r = appendRange(r, nextLo, unicode.MaxRune)
1763 }
1764 return r
1765 }
1766
1767
1768 func appendTable(r []rune, x *unicode.RangeTable) []rune {
1769 for _, xr := range x.R16 {
1770 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1771 if stride == 1 {
1772 r = appendRange(r, lo, hi)
1773 continue
1774 }
1775 for c := lo; c <= hi; c += stride {
1776 r = appendRange(r, c, c)
1777 }
1778 }
1779 for _, xr := range x.R32 {
1780 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1781 if stride == 1 {
1782 r = appendRange(r, lo, hi)
1783 continue
1784 }
1785 for c := lo; c <= hi; c += stride {
1786 r = appendRange(r, c, c)
1787 }
1788 }
1789 return r
1790 }
1791
1792
1793 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1794 nextLo := '\u0000'
1795 for _, xr := range x.R16 {
1796 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1797 if stride == 1 {
1798 if nextLo <= lo-1 {
1799 r = appendRange(r, nextLo, lo-1)
1800 }
1801 nextLo = hi + 1
1802 continue
1803 }
1804 for c := lo; c <= hi; c += stride {
1805 if nextLo <= c-1 {
1806 r = appendRange(r, nextLo, c-1)
1807 }
1808 nextLo = c + 1
1809 }
1810 }
1811 for _, xr := range x.R32 {
1812 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1813 if stride == 1 {
1814 if nextLo <= lo-1 {
1815 r = appendRange(r, nextLo, lo-1)
1816 }
1817 nextLo = hi + 1
1818 continue
1819 }
1820 for c := lo; c <= hi; c += stride {
1821 if nextLo <= c-1 {
1822 r = appendRange(r, nextLo, c-1)
1823 }
1824 nextLo = c + 1
1825 }
1826 }
1827 if nextLo <= unicode.MaxRune {
1828 r = appendRange(r, nextLo, unicode.MaxRune)
1829 }
1830 return r
1831 }
1832
1833
1834
1835 func negateClass(r []rune) []rune {
1836 nextLo := '\u0000'
1837 w := 0
1838 for i := 0; i < len(r); i += 2 {
1839 lo, hi := r[i], r[i+1]
1840 if nextLo <= lo-1 {
1841 r[w] = nextLo
1842 r[w+1] = lo - 1
1843 w += 2
1844 }
1845 nextLo = hi + 1
1846 }
1847 r = r[:w]
1848 if nextLo <= unicode.MaxRune {
1849
1850
1851 r = append(r, nextLo, unicode.MaxRune)
1852 }
1853 return r
1854 }
1855
1856
1857
1858
1859
1860 type ranges struct {
1861 p *[]rune
1862 }
1863
1864 func (ra ranges) Less(i, j int) bool {
1865 p := *ra.p
1866 i *= 2
1867 j *= 2
1868 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1869 }
1870
1871 func (ra ranges) Len() int {
1872 return len(*ra.p) / 2
1873 }
1874
1875 func (ra ranges) Swap(i, j int) {
1876 p := *ra.p
1877 i *= 2
1878 j *= 2
1879 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1880 }
1881
1882 func checkUTF8(s string) error {
1883 for s != "" {
1884 rune, size := utf8.DecodeRuneInString(s)
1885 if rune == utf8.RuneError && size == 1 {
1886 return &Error{Code: ErrInvalidUTF8, Expr: s}
1887 }
1888 s = s[size:]
1889 }
1890 return nil
1891 }
1892
1893 func nextRune(s string) (c rune, t string, err error) {
1894 c, size := utf8.DecodeRuneInString(s)
1895 if c == utf8.RuneError && size == 1 {
1896 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1897 }
1898 return c, s[size:], nil
1899 }
1900
1901 func isalnum(c rune) bool {
1902 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1903 }
1904
1905 func unhex(c rune) rune {
1906 if '0' <= c && c <= '9' {
1907 return c - '0'
1908 }
1909 if 'a' <= c && c <= 'f' {
1910 return c - 'a' + 10
1911 }
1912 if 'A' <= c && c <= 'F' {
1913 return c - 'A' + 10
1914 }
1915 return -1
1916 }
1917
View as plain text