View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.html;
16  
17  import java.util.Iterator;
18  import java.util.List;
19  import java.util.regex.Pattern;
20  
21  import org.apache.commons.lang3.StringUtils;
22  
23  import com.gargoylesoftware.htmlunit.Page;
24  import com.gargoylesoftware.htmlunit.SgmlPage;
25  import com.gargoylesoftware.htmlunit.javascript.host.Element;
26  
27  /**
28   * Utility to handle conversion from HTML code to string.
29   * TODO: simplify it (it is just copied from what was available in DomNode and subclasses).
30   *
31   * @author Marc Guillemot
32   * @author Ahmed Ashour
33   * @author Ronald Brill
34   * @author Rob Kodey
35   */
36  class HtmlSerializer {
37      private final StringBuilder builder_ = new StringBuilder();
38      /** Indicates a block. Will be rendered as line separator (multiple block marks are ignored) */
39      protected static final String AS_TEXT_BLOCK_SEPARATOR = "§bs§";
40      private static final int AS_TEXT_BLOCK_SEPARATOR_LENGTH = AS_TEXT_BLOCK_SEPARATOR.length();
41  
42      /** Indicates a new line. Will be rendered as line separator. */
43      protected static final String AS_TEXT_NEW_LINE = "§nl§";
44      private static final int AS_TEXT_NEW_LINE_LENGTH = AS_TEXT_NEW_LINE.length();
45  
46      /** Indicates a non blank that can't be trimmed or reduced. */
47      protected static final String AS_TEXT_BLANK = "§blank§";
48      /** Indicates a tab. */
49      protected static final String AS_TEXT_TAB = "§tab§";
50  
51      private static final Pattern TEXT_AREA_PATTERN = Pattern.compile("\r?\n");
52  
53      private boolean appletEnabled_;
54      private boolean ignoreMaskedElements_ = true;
55  
56      /**
57       * Converts an HTML node to text.
58       * @param node a node
59       * @return the text representation according to the setting of this serializer
60       */
61      public String asText(final DomNode node) {
62          appletEnabled_ = node.getPage().getWebClient().getOptions().isAppletEnabled();
63          builder_.setLength(0);
64          appendNode(node);
65          final String response = builder_.toString();
66          builder_.setLength(0);
67          return cleanUp(response);
68      }
69  
70      protected String cleanUp(String text) {
71          // ignore <br/> at the end of a block
72          text = reduceWhitespace(text);
73          text = StringUtils.replace(text, AS_TEXT_BLANK, " ");
74          final String ls = System.lineSeparator();
75          text = StringUtils.replace(text, AS_TEXT_NEW_LINE, ls);
76          // text = CLEAN_UP_PATTERN.matcher(text).replaceAll(ls); // many block sep => 1 new line
77          text = StringUtils.replace(text, AS_TEXT_BLOCK_SEPARATOR, ls);
78          text = StringUtils.replace(text, AS_TEXT_TAB, "\t");
79  
80          return text;
81      }
82  
83      private static String reduceWhitespace(String text) {
84          text = trim(text);
85  
86          // remove white spaces before or after block separators
87          text = reduceWhiteSpaceAroundBlockSeparator(text);
88  
89          // remove leading block separators
90          while (text.startsWith(AS_TEXT_BLOCK_SEPARATOR)) {
91              text = text.substring(AS_TEXT_BLOCK_SEPARATOR_LENGTH);
92          }
93  
94          // remove trailing block separators
95          while (text.endsWith(AS_TEXT_BLOCK_SEPARATOR)) {
96              text = text.substring(0, text.length() - AS_TEXT_BLOCK_SEPARATOR_LENGTH);
97          }
98          text = trim(text);
99  
100         final StringBuilder builder = new StringBuilder(text.length());
101 
102         boolean whitespace = false;
103         for (final char ch : text.toCharArray()) {
104 
105             // Translate non-breaking space to regular space.
106             if (ch == (char) 160) {
107                 builder.append(' ');
108                 whitespace = false;
109             }
110             else {
111                 if (whitespace) {
112                     if (!isSpace(ch)) {
113                         builder.append(ch);
114                         whitespace = false;
115                     }
116                 }
117                 else {
118                     if (isSpace(ch)) {
119                         whitespace = true;
120                         builder.append(' ');
121                     }
122                     else {
123                         builder.append(ch);
124                     }
125                 }
126             }
127         }
128         return builder.toString();
129     }
130 
131     private static boolean isSpace(final char ch) {
132         return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r';
133     }
134 
135     private static String trim(String string) {
136         int length = string.length();
137 
138         int start = 0;
139         while (start != length && isSpace(string.charAt(start))) {
140             start++;
141         }
142         if (start != 0) {
143             string = string.substring(start);
144             length = string.length();
145         }
146 
147         if (length != 0) {
148             int end = length;
149             while (end != 0 && isSpace(string.charAt(end - 1))) {
150                 end--;
151             }
152             if (end != length) {
153                 string = string.substring(0, end);
154             }
155         }
156 
157         return string;
158     }
159 
160     private static String reduceWhiteSpaceAroundBlockSeparator(final String text) {
161         int p0 = text.indexOf(AS_TEXT_BLOCK_SEPARATOR);
162         if (p0 == -1) {
163             return text;
164         }
165 
166         final int length = text.length();
167         if (length <= AS_TEXT_BLOCK_SEPARATOR_LENGTH) {
168             return text;
169         }
170 
171         final StringBuilder result = new StringBuilder(length);
172         int start = 0;
173         while (p0 != -1) {
174             int p1 = p0 + AS_TEXT_BLOCK_SEPARATOR_LENGTH;
175             while (p0 != start && isSpace(text.charAt(p0 - 1))) {
176                 p0--;
177             }
178             if (p0 >= AS_TEXT_NEW_LINE_LENGTH && text.startsWith(AS_TEXT_NEW_LINE, p0 - AS_TEXT_NEW_LINE_LENGTH)) {
179                 p0 = p0 - AS_TEXT_NEW_LINE_LENGTH;
180             }
181             result.append(text.substring(start, p0)).append(AS_TEXT_BLOCK_SEPARATOR);
182 
183             while (p1 < length && isSpace(text.charAt(p1))) {
184                 p1++;
185             }
186             start = p1;
187 
188             // ignore duplicates
189             p0 = text.indexOf(AS_TEXT_BLOCK_SEPARATOR, start);
190             while (p0 != -1 && p0 == start) {
191                 start += AS_TEXT_BLOCK_SEPARATOR_LENGTH;
192                 p0 = text.indexOf(AS_TEXT_BLOCK_SEPARATOR, start);
193             }
194         }
195         if (start < length) {
196             result.append(text.substring(start));
197         }
198         return result.toString();
199     }
200 
201     protected void appendNode(final DomNode node) {
202         if (node instanceof DomText) {
203             appendText((DomText) node);
204         }
205         else if (node instanceof DomComment) {
206             // nothing to do
207         }
208         else if (node instanceof HtmlApplet && appletEnabled_) {
209             // nothing
210         }
211         else if (node instanceof HtmlBreak) {
212             doAppendNewLine();
213         }
214         else if (node instanceof HtmlHiddenInput
215                 || node instanceof HtmlScript
216                 || node instanceof HtmlStyle
217                 || node instanceof HtmlNoFrames) {
218             // nothing
219         }
220         else if (node instanceof HtmlTextArea) {
221             appendHtmlTextArea((HtmlTextArea) node);
222         }
223         else if (node instanceof HtmlTitle) {
224             appendHtmlTitle((HtmlTitle) node);
225         }
226         else if (node instanceof HtmlTableRow) {
227             appendHtmlTableRow((HtmlTableRow) node);
228         }
229         else if (node instanceof HtmlSelect) {
230             appendHtmlSelect((HtmlSelect) node);
231         }
232         else if (node instanceof HtmlSubmitInput) {
233             doAppend(((HtmlSubmitInput) node).asText());
234         }
235         else if (node instanceof HtmlResetInput) {
236             doAppend(((HtmlResetInput) node).asText());
237         }
238         else if (node instanceof HtmlCheckBoxInput) {
239             final String str;
240             if (((HtmlCheckBoxInput) node).isChecked()) {
241                 str = "checked";
242             }
243             else {
244                 str = "unchecked";
245             }
246             doAppend(str);
247         }
248         else if (node instanceof HtmlRadioButtonInput) {
249             final String str;
250             if (((HtmlRadioButtonInput) node).isChecked()) {
251                 str = "checked";
252             }
253             else {
254                 str = "unchecked";
255             }
256             doAppend(str);
257         }
258         else if (node instanceof HtmlInput) {
259             doAppend(((HtmlInput) node).getValueAttribute());
260         }
261         else if (node instanceof HtmlTable) {
262             appendHtmlTable((HtmlTable) node);
263         }
264         else if (node instanceof HtmlOrderedList) {
265             appendHtmlOrderedList((HtmlOrderedList) node);
266         }
267         else if (node instanceof HtmlUnorderedList) {
268             appendHtmlUnorderedList((HtmlUnorderedList) node);
269         }
270         else if (node instanceof HtmlPreformattedText) {
271             appendHtmlPreformattedText((HtmlPreformattedText) node);
272         }
273         else if (node instanceof HtmlInlineFrame) {
274             appendHtmlInlineFrame((HtmlInlineFrame) node);
275         }
276         else if (node instanceof HtmlNoScript && node.getPage().getWebClient().getOptions().isJavaScriptEnabled()) {
277             return;
278         }
279         else {
280             final boolean block;
281             final Object scriptableObject = node.getScriptableObject();
282             if (node instanceof HtmlBody) {
283                 block = false;
284             }
285             else if (scriptableObject instanceof Element) {
286                 final Element element = (Element) scriptableObject;
287                 final String display = element.getWindow().getComputedStyle(element, null).getDisplay(true);
288                 block = "block".equals(display);
289             }
290             else {
291                 block = false;
292             }
293 
294             if (block) {
295                 doAppendBlockSeparator();
296             }
297             appendChildren(node);
298             if (block) {
299                 doAppendBlockSeparator();
300             }
301         }
302     }
303 
304     private void doAppendBlockSeparator() {
305         builder_.append(AS_TEXT_BLOCK_SEPARATOR);
306     }
307 
308     private void doAppend(final String str) {
309         builder_.append(str);
310     }
311 
312     private void doAppendNewLine() {
313         builder_.append(AS_TEXT_NEW_LINE);
314     }
315 
316     private void doAppendTab() {
317         builder_.append(AS_TEXT_TAB);
318     }
319 
320     private void appendHtmlUnorderedList(final HtmlUnorderedList htmlUnorderedList) {
321         doAppendBlockSeparator();
322         boolean first = true;
323         for (final DomNode item : htmlUnorderedList.getChildren()) {
324             if (!first) {
325                 doAppendBlockSeparator();
326             }
327             first = false;
328             appendNode(item);
329         }
330         doAppendBlockSeparator();
331     }
332 
333     private void appendHtmlTitle(final HtmlTitle htmlTitle) {
334         // optimized version
335         // for the title there is no need to check the visibility
336         // of the containing dom text;
337         // this optimization defers the load of the style sheets
338         final DomNode child = htmlTitle.getFirstChild();
339         if (child instanceof DomText) {
340             doAppend(((DomText) child).getData());
341             doAppendBlockSeparator();
342             return;
343         }
344     }
345 
346     private void appendChildren(final DomNode node) {
347         for (final DomNode child : node.getChildren()) {
348             appendNode(child);
349         }
350     }
351 
352     private void appendHtmlTableRow(final HtmlTableRow htmlTableRow) {
353         boolean first = true;
354         for (final HtmlTableCell cell : htmlTableRow.getCells()) {
355             if (!first) {
356                 doAppendTab();
357             }
358             else {
359                 first = false;
360             }
361             appendChildren(cell); // trim?
362         }
363     }
364 
365     private void appendHtmlTextArea(final HtmlTextArea htmlTextArea) {
366         if (isVisible(htmlTextArea)) {
367             String text = htmlTextArea.getText();
368             text = StringUtils.stripEnd(text, null);
369             text = TEXT_AREA_PATTERN.matcher(text).replaceAll(AS_TEXT_NEW_LINE);
370             text = StringUtils.replace(text, "\r", AS_TEXT_NEW_LINE);
371             text = StringUtils.replace(text, " ", AS_TEXT_BLANK);
372             doAppend(text);
373         }
374     }
375 
376     private void appendHtmlTable(final HtmlTable htmlTable) {
377         doAppendBlockSeparator();
378         final String caption = htmlTable.getCaptionText();
379         if (caption != null) {
380             doAppend(caption);
381             doAppendBlockSeparator();
382         }
383 
384         boolean first = true;
385 
386         // first thead has to be displayed first and first tfoot has to be displayed last
387         final HtmlTableHeader tableHeader = htmlTable.getHeader();
388         if (tableHeader != null) {
389             first = appendHtmlTableRows(tableHeader.getRows(), true, null, null);
390         }
391         final HtmlTableFooter tableFooter = htmlTable.getFooter();
392 
393         final List<HtmlTableRow> tableRows = htmlTable.getRows();
394         first = appendHtmlTableRows(tableRows, first, tableHeader, tableFooter);
395 
396         if (tableFooter != null) {
397             first = appendHtmlTableRows(tableFooter.getRows(), first, null, null);
398         }
399         else if (tableRows.isEmpty()) {
400             final DomNode firstChild = htmlTable.getFirstChild();
401             if (firstChild != null) {
402                 appendNode(firstChild);
403             }
404         }
405 
406         doAppendBlockSeparator();
407     }
408 
409     private boolean appendHtmlTableRows(final List<HtmlTableRow> rows, boolean first, final TableRowGroup skipParent1,
410             final TableRowGroup skipParent2) {
411         for (final HtmlTableRow row : rows) {
412             if (row.getParentNode() == skipParent1 || row.getParentNode() == skipParent2) {
413                 continue;
414             }
415             if (!first) {
416                 doAppendBlockSeparator();
417             }
418             first = false;
419             appendHtmlTableRow(row);
420         }
421         return first;
422     }
423 
424     /**
425      * @param htmlSelect
426      */
427     private void appendHtmlSelect(final HtmlSelect htmlSelect) {
428         final List<HtmlOption> options;
429         if (htmlSelect.isMultipleSelectEnabled()) {
430             options = htmlSelect.getOptions();
431         }
432         else {
433             options = htmlSelect.getSelectedOptions();
434         }
435 
436         for (final Iterator<HtmlOption> i = options.iterator(); i.hasNext();) {
437             final HtmlOption currentOption = i.next();
438             appendNode(currentOption);
439             if (i.hasNext()) {
440                 doAppendBlockSeparator();
441             }
442         }
443     }
444 
445     /**
446      * Appends a {@code <ol>} taking care to numerate it.
447      * @param htmlOrderedList the OL element
448      */
449     private void appendHtmlOrderedList(final HtmlOrderedList htmlOrderedList) {
450         doAppendBlockSeparator();
451         boolean first = true;
452         int i = 1;
453         for (final DomNode item : htmlOrderedList.getChildren()) {
454             if (!first) {
455                 doAppendBlockSeparator();
456             }
457             first = false;
458             if (item instanceof HtmlListItem) {
459                 doAppend(Integer.toString(i++));
460                 doAppend(". ");
461                 appendChildren(item);
462             }
463             else {
464                 appendNode(item);
465             }
466         }
467         doAppendBlockSeparator();
468     }
469 
470     private void appendHtmlPreformattedText(final HtmlPreformattedText htmlPreformattedText) {
471         if (isVisible(htmlPreformattedText)) {
472             doAppendBlockSeparator();
473             String text = htmlPreformattedText.getTextContent();
474             text = StringUtils.replace(text, "\t", AS_TEXT_TAB);
475             text = StringUtils.replace(text, " ", AS_TEXT_BLANK);
476             text = TEXT_AREA_PATTERN.matcher(text).replaceAll(AS_TEXT_NEW_LINE);
477             text = StringUtils.replace(text, "\r", AS_TEXT_NEW_LINE);
478             doAppend(text);
479             doAppendBlockSeparator();
480         }
481     }
482 
483     private void appendHtmlInlineFrame(final HtmlInlineFrame htmlInlineFrame) {
484         if (isVisible(htmlInlineFrame)) {
485             doAppendBlockSeparator();
486             final Page page = htmlInlineFrame.getEnclosedPage();
487             if (page instanceof SgmlPage) {
488                 doAppend(((SgmlPage) page).asText());
489             }
490             doAppendBlockSeparator();
491         }
492     }
493 
494     private void appendText(final DomText domText) {
495         final DomNode parent = domText.getParentNode();
496         if (parent == null || parent instanceof HtmlTitle || isVisible(parent)) {
497             append(domText.getData());
498         }
499     }
500 
501     private boolean isVisible(final DomNode node) {
502         return !ignoreMaskedElements_ || node.isDisplayed();
503     }
504 
505     /**
506      * Indicates if element that are not displayed due to style settings
507      * (visibility or display) should be visible in generated text.
508      * @param ignore indicates if masked elements should be ignored or not
509      */
510     public void setIgnoreMaskedElements(final boolean ignore) {
511         ignoreMaskedElements_ = ignore;
512     }
513 
514     private void append(final String text) {
515         doAppend(text);
516     }
517 }