View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.javascript.regexp;
16  
17  import java.util.LinkedList;
18  import java.util.List;
19  import java.util.Stack;
20  
21  /**
22   * Translates JavaScript RegExp to Java RegExp.<br>
23   * // [...\b...] to [...\cH...]
24   * // [...[...] to [...\[...]
25   * // [^\\1] to .
26   * // back reference in character classes are simply ignored by browsers [...ab\5cd...] to [...abcd...]
27   * // characters escaped without need should be "un-escaped"
28   * Escape curly braces that are not used in an expression
29   * like "{n}", "{n,}" or "{n,m}" (where n and m are positive integers).
30   *
31   * @author Ronald Brill
32   * @author Leszek Hoppe
33   */
34  public class RegExpJsToJavaConverter {
35  
36      private static final String DIGITS = "0123456789";
37  
38      /**
39       * Helper to encapsulate the transformations.
40       */
41      private static class Tape {
42          private final StringBuilder tape_;
43          private int currentPos_;
44  
45          /**
46           * Wraps a JavaScript RegExp to access it char by char like a tape.
47           *
48           * @param input the Javascript RegExp
49           */
50          Tape(final String input) {
51              currentPos_ = 0;
52              tape_ = new StringBuilder(input);
53          }
54  
55          /**
56           * Moves the current read position by offset.
57           * @param offset the move position offset
58           */
59          public void move(final int offset) {
60              currentPos_ += offset;
61          }
62  
63          /**
64           * Reads the character at the current position and
65           * moves the read position by 1.
66           *
67           * @return the character at current position
68           */
69          public int read() {
70              if (currentPos_ < 0) {
71                  return -1;
72              }
73              if (currentPos_ >= tape_.length()) {
74                  return -1;
75              }
76  
77              return tape_.charAt(currentPos_++);
78          }
79  
80          /**
81           * Inserts a string at the current position + offset.
82           * @param token the string to insert
83           * @param offset the move position offset
84           */
85          public void insert(final String token, final int offset) {
86              tape_.insert(currentPos_ + offset, token);
87              currentPos_ += token.length();
88          }
89  
90          /**
91           * Inserts a string at the given pos.
92           * @param token the string to insert
93           * @param pos the move position offset
94           */
95          public void insertAt(final String token, final int pos) {
96              tape_.insert(pos, token);
97              currentPos_ += token.length();
98          }
99  
100         /**
101          * Replaces the current char with the given string.
102          * @param token the string to insert
103          */
104         public void replace(final String token) {
105             tape_.replace(currentPos_, currentPos_ + 1, token);
106         }
107 
108         /**
109          * Read the whole tape content.
110          *
111          * @return tape content
112          */
113         @Override
114         public String toString() {
115             return tape_.toString();
116         }
117     }
118 
119     private static final class Subexpresion {
120         private boolean closed_;
121         private boolean optional_;
122         private boolean enhanced_;
123         private int start_;
124         private int end_;
125 
126         private Subexpresion() {
127             closed_ = false;
128             optional_ = false;
129             enhanced_ = false;
130             start_ = -1;
131             end_ = -1;
132         }
133     }
134 
135     private Tape tape_;
136     private boolean insideCharClass_;
137     private boolean insideRepetition_;
138     private Stack<Subexpresion> parsingSubexpressions_;
139     private List<Subexpresion> subexpressions_;
140 
141     /**
142      * Initiate the FSM.
143      */
144     public RegExpJsToJavaConverter() {
145         super();
146     }
147 
148     /**
149      * Run the state machine on a given input string.
150      *
151      * @param input
152      *            the js regexp to process
153      * @return a valid java regex pattern
154      */
155     public String convert(final String input) {
156         tape_ = new Tape(input);
157         insideCharClass_ = false;
158         insideRepetition_ = false;
159 
160         parsingSubexpressions_ = new Stack<>();
161         subexpressions_ = new LinkedList<>();
162 
163         int current = tape_.read();
164         while (current > -1) {
165             if ('\\' == current) {
166                 processEscapeSequence();
167             }
168             else if ('[' == current) {
169                 processCharClassStart();
170             }
171             else if (']' == current) {
172                 processCharClassEnd();
173             }
174             else if ('{' == current) {
175                 processRepetitionStart();
176             }
177             else if ('}' == current) {
178                 processRepetitionEnd();
179             }
180             else if ('(' == current) {
181                 processSubExpressionStart();
182             }
183             else if (')' == current) {
184                 processSubExpressionEnd();
185             }
186 
187             // process next
188             current = tape_.read();
189         }
190 
191         return tape_.toString();
192     }
193 
194     private void processCharClassStart() {
195         if (insideCharClass_) {
196             tape_.insert("\\", -1);
197         }
198         else {
199             insideCharClass_ = true;
200 
201             int next = tape_.read();
202             if (next < 0) {
203                 tape_.insert("\\", -1);
204                 return;
205             }
206             if ('^' == next) {
207                 // [^\2]
208                 next = tape_.read();
209                 if (next < 0) {
210                     tape_.insert("\\", -2);
211                     return;
212                 }
213                 if ('\\' == next) {
214                     next = tape_.read();
215                     if (DIGITS.indexOf(next) < 0) {
216                         tape_.move(-2);
217                         return;
218                     }
219                     // if it was a back reference then we have to check more
220                     if (handleBackReferenceOrOctal(next)) {
221                         next = tape_.read();
222                         if (']' == next) {
223                             tape_.move(-3);
224                             tape_.replace("");
225                             tape_.replace("");
226                             tape_.replace(".");
227                             insideCharClass_ = false;
228                         }
229                     }
230                 }
231                 else {
232                     tape_.move(-1);
233                 }
234             }
235             else {
236                 tape_.move(-1);
237             }
238         }
239     }
240 
241     private void processCharClassEnd() {
242         insideCharClass_ = false;
243     }
244 
245     private void processRepetitionStart() {
246         final int next = tape_.read();
247         if (next < 0) {
248             tape_.insert("\\", -1);
249             return;
250         }
251 
252         if (DIGITS.indexOf(next) > -1) {
253             insideRepetition_ = true;
254         }
255         else {
256             tape_.insert("\\", -2);
257             tape_.move(-1);
258         }
259     }
260 
261     private void processRepetitionEnd() {
262         if (insideRepetition_) {
263             insideRepetition_ = false;
264             return;
265         }
266 
267         tape_.insert("\\", -1);
268     }
269 
270     private void processSubExpressionStart() {
271         int next = tape_.read();
272         if (next < 0) {
273             return;
274         }
275 
276         if ('?' != next) {
277             final Subexpresion sub = new Subexpresion();
278             sub.start_ = tape_.currentPos_;
279             parsingSubexpressions_.push(sub);
280             subexpressions_.add(sub);
281 
282             tape_.move(-1);
283             return;
284         }
285 
286         next = tape_.read();
287         if (next < 0) {
288             return;
289         }
290         if (':' != next) {
291             final Subexpresion sub = new Subexpresion();
292             sub.start_ = tape_.currentPos_;
293             parsingSubexpressions_.push(sub);
294             subexpressions_.add(sub);
295 
296             tape_.move(-1);
297             return;
298         }
299 
300         final Subexpresion sub = new Subexpresion();
301         sub.start_ = tape_.currentPos_;
302         parsingSubexpressions_.push(sub);
303     }
304 
305     private void processSubExpressionEnd() {
306         if (parsingSubexpressions_.isEmpty()) {
307             return;
308         }
309         final Subexpresion sub = parsingSubexpressions_.pop();
310         sub.closed_ = true;
311         sub.end_ = tape_.currentPos_;
312 
313         final int next = tape_.read();
314         sub.optional_ = '?' == next;
315         tape_.move(-1);
316     }
317 
318     private void processEscapeSequence() {
319         final int escapeSequence = tape_.read();
320         if (escapeSequence < 0) {
321             return;
322         }
323 
324         if ('x' == escapeSequence) {
325             // Hex code (e.g. \x00)
326             // Read the two char hex code
327             tape_.move(2);
328             return;
329         }
330 
331         if ('u' == escapeSequence) {
332             // Unicode (e.g. \u0009)
333             // Read the for char unicode
334             tape_.move(4);
335             return;
336         }
337 
338         if ("ACEFGHIJKLMNOPQRTUVXYZaeghijklmpqyz".indexOf(escapeSequence) > -1) {
339             // no need to escape this chars
340             tape_.move(-2);
341             tape_.replace("");
342             tape_.move(1);
343             return;
344         }
345 
346         if (insideCharClass_ && 'b' == escapeSequence) {
347             // [...\b...] -> [...\cH...]
348             tape_.move(-1);
349             tape_.replace("cH");
350             tape_.move(2);
351             return;
352         }
353 
354         if (DIGITS.indexOf(escapeSequence) > -1) {
355             handleBackReferenceOrOctal(escapeSequence);
356         }
357     }
358 
359     private boolean handleBackReferenceOrOctal(final int aFirstChar) {
360         // first try to figure out what the number is
361         final StringBuilder tmpNo = new StringBuilder(Character.toString((char) aFirstChar));
362         int tmpInsertPos = -1;
363         int next = tape_.read();
364         if (next > -1) {
365             if (DIGITS.indexOf(next) > -1) {
366                 tmpNo.append(next);
367                 tmpInsertPos--;
368                 next = tape_.read();
369                 if (next > -1) {
370                     if (DIGITS.indexOf(next) > -1) {
371                         tmpNo.append(next);
372                         tmpInsertPos--;
373                     }
374                     else {
375                         tape_.move(-1);
376                     }
377                 }
378             }
379             else {
380                 tape_.move(-1);
381                 if ('0' == aFirstChar) {
382                     // \0 has to be replaced by \x00
383                     tape_.insert("x0", -1);
384                     return false;
385                 }
386             }
387         }
388         else {
389             // if \0 is last character of pattern
390             if ('0' == aFirstChar) {
391                 // \0 has to be replaced by \x00
392                 tape_.insert("x0", -1);
393                 return false;
394             }
395         }
396 
397         if (tmpNo.charAt(0) == '0') {
398             // we have a octal here
399             return false;
400         }
401 
402         final int value = Integer.parseInt(tmpNo.toString());
403         if (value > subexpressions_.size()) {
404             // we have a octal here
405             tape_.insert("0", tmpInsertPos);
406             return false;
407         }
408 
409         // ignore invalid back references (inside char classes
410         // of if the referenced group is not (yet) available
411         if (insideCharClass_
412                 || (0 < value && value <= subexpressions_.size() && !subexpressions_.get(value - 1).closed_)
413                 || value > subexpressions_.size()) {
414             // drop back reference
415             for (int i = tmpInsertPos; i <= 0; i++) {
416                 tape_.move(-1);
417                 tape_.replace("");
418             }
419         }
420 
421         // ok it is a backreference
422         final Subexpresion back = subexpressions_.get(value - 1);
423         if (back.optional_ &&  !back.enhanced_) {
424             // change subexpression to make it java compatible
425             int insertPos = back.start_ - 1;
426             tape_.insertAt("(?:", insertPos);
427             for (Subexpresion subexp : subexpressions_) {
428                 if (subexp.start_ > insertPos) {
429                     subexp.start_ = subexp.start_ + 3;
430                 }
431                 if (subexp.end_ > insertPos) {
432                     subexp.end_ = subexp.end_ + 3;
433                 }
434             }
435 
436             insertPos = back.end_ + 1;
437             tape_.insertAt(")", insertPos);
438             for (Subexpresion subexp : subexpressions_) {
439                 if (subexp.start_ > insertPos) {
440                     subexp.start_ = subexp.start_ + 1;
441                 }
442                 if (subexp.end_ > insertPos) {
443                     subexp.end_ = subexp.end_ + 1;
444                 }
445             }
446         }
447         return true;
448     }
449 }