1   
2   
3   
4   
5   
6   
7   
8   package org.dom4j.io;
9   
10  import java.io.IOException;
11  import java.io.OutputStream;
12  import java.io.StringWriter;
13  import java.io.UnsupportedEncodingException;
14  import java.io.Writer;
15  import java.util.HashSet;
16  import java.util.Iterator;
17  import java.util.Set;
18  import java.util.Stack;
19  
20  import org.dom4j.Document;
21  import org.dom4j.DocumentHelper;
22  import org.dom4j.Element;
23  import org.dom4j.Entity;
24  import org.dom4j.Node;
25  
26  import org.xml.sax.SAXException;
27  
28  /***
29   * <p>
 * <code>HTMLWriter</code> takes a DOM4J tree and formats it to a stream as
 * HTML. This formatter is similar to XMLWriter, but it outputs the text of
 * CDATA and Entity sections rather than their serialised XML form, it has an
 * XHTML mode, it retains whitespace in certain elements such as &lt;PRE&gt;,
 * and it supports certain elements which have no corresponding close tag,
 * such as &lt;BR&gt; and &lt;P&gt;.
36   * </p>
37   * 
38   * <p>
39   * The OutputFormat passed in to the constructor is checked for isXHTML() and
40   * isExpandEmptyElements(). See {@link OutputFormat OutputFormat}for details.
41   * Here are the rules for <b>this class </b> based on an OutputFormat, "format",
42   * passed in to the constructor: <br/><br/>
43   * 
44   * <ul>
45   * <li>If an element is in {@link #getOmitElementCloseSet()
46   * getOmitElementCloseSet}, then it is treated specially:
47   * 
48   * <ul>
 * <li>It never expands, since some browsers treat this as two separate
 * Horizontal Rules: &lt;HR&gt;&lt;/HR&gt;</li>
 * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, then
 * it has a space before the closing single-tag slash, since Netscape 4.x-
 * treats this: &lt;HR /&gt; as an element named "HR" with an attribute named
 * "/", but that's better than when it refuses to recognize this: &lt;hr/&gt;
 * which it thinks is an element named "HR/".</li>
56   * </ul>
57   * 
58   * </li>
59   * <li>If {@link org.dom4j.io.OutputFormat#isXHTML() format.isXHTML()}, all
60   * elements must have either a close element, or be a closed single tag.</li>
 * <li>If {@link org.dom4j.io.OutputFormat#isExpandEmptyElements()
 * format.isExpandEmptyElements()} is true, all elements are expanded except
 * as above.</li>
64   * </ul>
65   * 
66   * <b>Examples </b>
67   * </p>
68   * 
69   * <p>
70   * </p>
71   * 
72   * <p>
73   * If isXHTML == true, CDATA sections look like this:
74   * 
75   * <PRE>
76   * 
 * <b>&lt;myelement&gt;&lt;![CDATA[My data]]&gt;&lt;/myelement&gt; </b>
78   * 
79   * </PRE>
80   * 
81   * Otherwise, they look like this:
82   * 
83   * <PRE>
84   * 
 * <b>&lt;myelement&gt;My data&lt;/myelement&gt; </b>
86   * 
87   * </PRE>
88   * 
89   * </p>
90   * 
91   * <p>
 * Basically, {@link OutputFormat#isXHTML() OutputFormat.isXHTML()} ==
 * <code>true</code> will produce valid XML, while {@link
 * org.dom4j.io.OutputFormat#isExpandEmptyElements()
 * format.isExpandEmptyElements()} determines whether empty elements are
 * expanded if isXHTML is true, excepting the special HTML single tags.
97   * </p>
98   * 
99   * <p>
 * Also, HTMLWriter handles tags whose contents should be preformatted, that is,
 * whitespace-preserved. By default, this set includes the tags &lt;PRE&gt;,
 * &lt;SCRIPT&gt;, &lt;STYLE&gt;, and &lt;TEXTAREA&gt;, case insensitively. It
 * does not include &lt;IFRAME&gt;. Other tags, such as &lt;CODE&gt;,
 * &lt;KBD&gt;, &lt;TT&gt;, &lt;VAR&gt;, are usually rendered in a different
 * font in most browsers, but don't preserve whitespace, so they also don't
 * appear in the default list. HTML Comments are always whitespace-preserved.
 * However, the parser you use may store comments with linefeed-only text nodes
 * (\n) even if your platform uses another line.separator character, and
 * HTMLWriter outputs Comment nodes exactly as the DOM is set up by the parser.
 * See examples and discussion here: {@link #setPreformattedTags(java.util.Set)
 * setPreformattedTags}
112  * </p>
113  * 
114  * <p>
115  * <b>Examples </b>
116  * </p>
117  * <blockquote>
118  * <p>
119  * <b>Pretty Printing </b>
120  * </p>
121  * 
122  * <p>
 * This example shows how to pretty print a string containing a valid HTML
 * document to a string. You can also just call the static methods of this
 * class: <br>
 * {@link #prettyPrintHTML(String) prettyPrintHTML(String)} or <br>
 * {@link #prettyPrintHTML(String,boolean,boolean,boolean,boolean)
 * prettyPrintHTML(String,boolean,boolean,boolean,boolean)} or, <br>
 * {@link #prettyPrintXHTML(String) prettyPrintXHTML(String)} for XHTML (note
 * the X)
131  * </p>
132  * 
133  * <pre>
134  * String testPrettyPrint(String html) {
135  *     StringWriter sw = new StringWriter();
136  *     OutputFormat format = OutputFormat.createPrettyPrint();
137  *     // These are the default values for createPrettyPrint,
138  *     // so you needn't set them:
139  *     // format.setNewlines(true);
 *     // format.setTrimText(true);
141  *     format.setXHTML(true);
142  *     HTMLWriter writer = new HTMLWriter(sw, format);
143  *     Document document = DocumentHelper.parseText(html);
144  *     writer.write(document);
145  *     writer.flush();
146  *     return sw.toString();
147  * }
148  * </pre>
149  * 
150  * <p>
 * This example shows how to create a "squeezed" document, but one that will
 * work in browsers even if the browser line length is limited. No newlines are
 * included, no extra whitespace at all, except where it is required by
 * {@link #setPreformattedTags(java.util.Set) setPreformattedTags}.
155  * </p>
156  * 
157  * <pre>
158  * String testCrunch(String html) {
159  *     StringWriter sw = new StringWriter();
160  *     OutputFormat format = OutputFormat.createPrettyPrint();
161  *     format.setNewlines(false);
162  *     format.setTrimText(true);
163  *     format.setIndent("");
164  *     format.setXHTML(true);
165  *     format.setExpandEmptyElements(false);
166  *     format.setNewLineAfterNTags(20);
167  *     org.dom4j.io.HTMLWriter writer = new HTMLWriter(sw, format);
168  *     org.dom4j.Document document = DocumentHelper.parseText(html);
169  *     writer.write(document);
170  *     writer.flush();
171  *     return sw.toString();
172  * }
173  * </pre>
174  * 
175  * </blockquote>
176  * 
177  * @author <a href="mailto:james.strachan@metastuff.com">James Strachan </a>
178  * @author Laramie Crocker
179  * @version $Revision: 1.21 $
180  */
181 public class HTMLWriter extends XMLWriter {
182     private static String lineSeparator = System.getProperty("line.separator");
183 
184     protected static final HashSet DEFAULT_PREFORMATTED_TAGS;
185 
186     static {
187         
188         
189         DEFAULT_PREFORMATTED_TAGS = new HashSet();
190         DEFAULT_PREFORMATTED_TAGS.add("PRE");
191         DEFAULT_PREFORMATTED_TAGS.add("SCRIPT");
192         DEFAULT_PREFORMATTED_TAGS.add("STYLE");
193         DEFAULT_PREFORMATTED_TAGS.add("TEXTAREA");
194     }
195 
196     protected static final OutputFormat DEFAULT_HTML_FORMAT;
197 
198     static {
199         DEFAULT_HTML_FORMAT = new OutputFormat("  ", true);
200         DEFAULT_HTML_FORMAT.setTrimText(true);
201         DEFAULT_HTML_FORMAT.setSuppressDeclaration(true);
202     }
203 
204     private Stack formatStack = new Stack();
205 
206     private String lastText = "";
207 
208     private int tagsOuput = 0;
209 
210     
211     private int newLineAfterNTags = -1;
212 
213     private HashSet preformattedTags = DEFAULT_PREFORMATTED_TAGS;
214 
215     /***
216      * Used to store the qualified element names which should have no close
217      * element tag
218      */
219     private HashSet omitElementCloseSet;
220 
221     public HTMLWriter(Writer writer) {
222         super(writer, DEFAULT_HTML_FORMAT);
223     }
224 
225     public HTMLWriter(Writer writer, OutputFormat format) {
226         super(writer, format);
227     }
228 
229     public HTMLWriter() throws UnsupportedEncodingException {
230         super(DEFAULT_HTML_FORMAT);
231     }
232 
233     public HTMLWriter(OutputFormat format) throws UnsupportedEncodingException {
234         super(format);
235     }
236 
237     public HTMLWriter(OutputStream out) throws UnsupportedEncodingException {
238         super(out, DEFAULT_HTML_FORMAT);
239     }
240 
241     public HTMLWriter(OutputStream out, OutputFormat format)
242             throws UnsupportedEncodingException {
243         super(out, format);
244     }
245 
246     public void startCDATA() throws SAXException {
247     }
248 
249     public void endCDATA() throws SAXException {
250     }
251 
252     
253     
254     protected void writeCDATA(String text) throws IOException {
255         
256         
257         if (getOutputFormat().isXHTML()) {
258             super.writeCDATA(text);
259         } else {
260             writer.write(text);
261         }
262 
263         lastOutputNodeType = Node.CDATA_SECTION_NODE;
264     }
265 
266     protected void writeEntity(Entity entity) throws IOException {
267         writer.write(entity.getText());
268         lastOutputNodeType = Node.ENTITY_REFERENCE_NODE;
269     }
270 
271     protected void writeDeclaration() throws IOException {
272     }
273 
274     protected void writeString(String text) throws IOException {
275         
276 
277 
278 
279 
280 
281 
282 
283 
284 
285 
286 
287         if (text.equals("\n")) {
288             if (!formatStack.empty()) {
289                 super.writeString(lineSeparator);
290             }
291 
292             return;
293         }
294 
295         lastText = text;
296 
297         if (formatStack.empty()) {
298             super.writeString(text.trim());
299         } else {
300             super.writeString(text);
301         }
302     }
303 
304     /***
305      * Overriden method to not close certain element names to avoid wierd
306      * behaviour from browsers for versions up to 5.x
307      * 
308      * @param qualifiedName
309      *            DOCUMENT ME!
310      * 
311      * @throws IOException
312      *             DOCUMENT ME!
313      */
314     protected void writeClose(String qualifiedName) throws IOException {
315         if (!omitElementClose(qualifiedName)) {
316             super.writeClose(qualifiedName);
317         }
318     }
319 
320     protected void writeEmptyElementClose(String qualifiedName)
321             throws IOException {
322         if (getOutputFormat().isXHTML()) {
323             
324             if (omitElementClose(qualifiedName)) {
325                 
326                 
327                 
328                 
329                 
330                 writer.write(" />");
331             } else {
332                 super.writeEmptyElementClose(qualifiedName);
333             }
334         } else {
335             
336             if (omitElementClose(qualifiedName)) {
337                 
338                 writer.write(">");
339             } else {
340                 
341                 
342                 super.writeEmptyElementClose(qualifiedName);
343             }
344         }
345     }
346 
347     protected boolean omitElementClose(String qualifiedName) {
348         return internalGetOmitElementCloseSet().contains(
349                 qualifiedName.toUpperCase());
350     }
351 
352     private HashSet internalGetOmitElementCloseSet() {
353         if (omitElementCloseSet == null) {
354             omitElementCloseSet = new HashSet();
355             loadOmitElementCloseSet(omitElementCloseSet);
356         }
357 
358         return omitElementCloseSet;
359     }
360 
361     
362     protected void loadOmitElementCloseSet(Set set) {
363         set.add("AREA");
364         set.add("BASE");
365         set.add("BR");
366         set.add("COL");
367         set.add("HR");
368         set.add("IMG");
369         set.add("INPUT");
370         set.add("LINK");
371         set.add("META");
372         set.add("P");
373         set.add("PARAM");
374     }
375 
376     
377 
378     /***
379      * A clone of the Set of elements that can have their close-tags omitted. By
380      * default it should be "AREA", "BASE", "BR", "COL", "HR", "IMG", "INPUT",
381      * "LINK", "META", "P", "PARAM"
382      * 
383      * @return A clone of the Set.
384      */
385     public Set getOmitElementCloseSet() {
386         return (Set) (internalGetOmitElementCloseSet().clone());
387     }
388 
389     /***
390      * To use the empty set, pass an empty Set, or null:
391      * 
392      * <pre>
393      * 
394      * 
395      *       setOmitElementCloseSet(new HashSet());
396      *     or
397      *       setOmitElementCloseSet(null);
398      * 
399      *  
400      * </pre>
401      * 
402      * @param newSet
403      *            DOCUMENT ME!
404      */
405     public void setOmitElementCloseSet(Set newSet) {
406         
407         omitElementCloseSet = new HashSet();
408 
409         if (newSet != null) {
410             omitElementCloseSet = new HashSet();
411 
412             Object aTag;
413             Iterator iter = newSet.iterator();
414 
415             while (iter.hasNext()) {
416                 aTag = iter.next();
417 
418                 if (aTag != null) {
419                     omitElementCloseSet.add(aTag.toString().toUpperCase());
420                 }
421             }
422         }
423     }
424 
425     /***
426      * @see #setPreformattedTags(java.util.Set) setPreformattedTags
427      */
428     public Set getPreformattedTags() {
429         return (Set) (preformattedTags.clone());
430     }
431 
432     /***
433      * <p>
434      * Override the default set, which includes PRE, SCRIPT, STYLE, and
435      * TEXTAREA, case insensitively.
436      * </p>
437      * 
438      * <p>
439      * <b>Setting Preformatted Tags </b>
440      * </p>
441      * 
442      * <p>
443      * Pass in a Set of Strings, one for each tag name that should be treated
444      * like a PRE tag. You may pass in null or an empty Set to assign the empty
445      * set, in which case no tags will be treated as preformatted, except that
446      * HTML Comments will continue to be preformatted. If a tag is included in
447      * the set of preformatted tags, all whitespace within the tag will be
448      * preserved, including whitespace on the same line preceding the close tag.
449      * This will generally make the close tag not line up with the start tag,
450      * but it preserves the intention of the whitespace within the tag.
451      * </p>
452      * 
453      * <p>
454      * The browser considers leading whitespace before the close tag to be
455      * significant, but leading whitespace before the open tag to be
456      * insignificant. For example, if the HTML author doesn't put the close
457      * TEXTAREA tag flush to the left margin, then the TEXTAREA control in the
458      * browser will have spaces on the last line inside the control. This may be
459      * the HTML author's intent. Similarly, in a PRE, the browser treats a
460      * flushed left close PRE tag as different from a close tag with leading
461      * whitespace. Again, this must be left up to the HTML author.
462      * </p>
463      * 
464      * <p>
465      * <b>Examples </b>
466      * </p>
467      * <blockquote>
468      * <p>
469      * Here is an example of how you can set the PreformattedTags list using
470      * setPreformattedTags to include IFRAME, as well as the default set, if you
471      * have an instance of this class named myHTMLWriter:
472      * 
473      * <pre>
474      * Set current = myHTMLWriter.getPreformattedTags();
475      * current.add("IFRAME");
476      * myHTMLWriter.setPreformattedTags(current);
477      * 
478      * //The set is now <b>PRE, SCRIPT, STYLE, TEXTAREA, IFRAME</b>
479      * 
480      * 
481      * </pre>
482      * 
483      * Similarly, you can simply replace it with your own:
484      * 
485      * <pre>
486      * 
487      * 
488      *       HashSet newset = new HashSet();
489      *       newset.add("PRE");
490      *       newset.add("TEXTAREA");
491      *       myHTMLWriter.setPreformattedTags(newset);
492      * 
493      *       //The set is now <b>{PRE, TEXTAREA}</b>
494      * 
495      *  
496      * </pre>
497      * 
498      * You can remove all tags from the preformatted tags list, with an empty
499      * set, like this:
500      * 
501      * <pre>
502      * 
503      * 
504      *       myHTMLWriter.setPreformattedTags(new HashSet());
505      * 
506      *       //The set is now <b>{}</b>
507      * 
508      *  
509      * </pre>
510      * 
511      * or with null, like this:
512      * 
513      * <pre>
514      * 
515      * 
516      *       myHTMLWriter.setPreformattedTags(null);
517      * 
518      *       //The set is now <b>{}</b>
519      * 
520      *  
521      * </pre>
522      * 
523      * </p>
524      * </blockquote>
525      * 
526      * @param newSet
527      *            DOCUMENT ME!
528      */
529     public void setPreformattedTags(Set newSet) {
530         
531         
532         
533         
534         preformattedTags = new HashSet();
535 
536         if (newSet != null) {
537             Object aTag;
538             Iterator iter = newSet.iterator();
539 
540             while (iter.hasNext()) {
541                 aTag = iter.next();
542 
543                 if (aTag != null) {
544                     preformattedTags.add(aTag.toString().toUpperCase());
545                 }
546             }
547         }
548     }
549 
550     /***
551      * DOCUMENT ME!
552      * 
553      * @param qualifiedName
554      *            DOCUMENT ME!
555      * 
556      * @return true if the qualifiedName passed in matched (case-insensitively)
557      *         a tag in the preformattedTags set, or false if not found or if
558      *         the set is empty or null.
559      * 
560      * @see #setPreformattedTags(java.util.Set) setPreformattedTags
561      */
562     public boolean isPreformattedTag(String qualifiedName) {
563         
564         
565         return (preformattedTags != null)
566                 && (preformattedTags.contains(qualifiedName.toUpperCase()));
567     }
568 
569     /***
570      * This override handles any elements that should not remove whitespace,
571      * such as <PRE>, <SCRIPT>, <STYLE>, and <TEXTAREA>.
572      * Note: the close tags won't line up with the open tag, but we can't alter
573      * that. See javadoc note at setPreformattedTags.
574      * 
575      * @param element
576      *            DOCUMENT ME!
577      * 
578      * @throws IOException
579      *             When the stream could not be written to.
580      * 
581      * @see #setPreformattedTags(java.util.Set) setPreformattedTags
582      */
583     protected void writeElement(Element element) throws IOException {
584         if (newLineAfterNTags == -1) { 
585             lazyInitNewLinesAfterNTags();
586         }
587 
588         if (newLineAfterNTags > 0) {
589             if ((tagsOuput > 0) && ((tagsOuput % newLineAfterNTags) == 0)) {
590                 super.writer.write(lineSeparator);
591             }
592         }
593 
594         tagsOuput++;
595 
596         String qualifiedName = element.getQualifiedName();
597         String saveLastText = lastText;
598         int size = element.nodeCount();
599 
600         if (isPreformattedTag(qualifiedName)) {
601             OutputFormat currentFormat = getOutputFormat();
602             boolean saveNewlines = currentFormat.isNewlines();
603             boolean saveTrimText = currentFormat.isTrimText();
604             String currentIndent = currentFormat.getIndent();
605 
606             
607             
608             formatStack.push(new FormatState(saveNewlines, saveTrimText,
609                     currentIndent));
610 
611             try {
612                 
613                 
614                 super.writePrintln();
615 
616                 if ((saveLastText.trim().length() == 0)
617                         && (currentIndent != null)
618                         && (currentIndent.length() > 0)) {
619                     
620                     
621                     
622                     
623                     
624                     super.writer.write(justSpaces(saveLastText));
625                 }
626 
627                 
628                 
629                 currentFormat.setNewlines(false);
630                 currentFormat.setTrimText(false);
631                 currentFormat.setIndent("");
632 
633                 
634                 super.writeElement(element);
635             } finally {
636                 FormatState state = (FormatState) formatStack.pop();
637                 currentFormat.setNewlines(state.isNewlines());
638                 currentFormat.setTrimText(state.isTrimText());
639                 currentFormat.setIndent(state.getIndent());
640             }
641         } else {
642             super.writeElement(element);
643         }
644     }
645 
646     private String justSpaces(String text) {
647         int size = text.length();
648         StringBuffer res = new StringBuffer(size);
649         char c;
650 
651         for (int i = 0; i < size; i++) {
652             c = text.charAt(i);
653 
654             switch (c) {
655                 case '\r':
656                 case '\n':
657 
658                     continue;
659 
660                 default:
661                     res.append(c);
662             }
663         }
664 
665         return res.toString();
666     }
667 
668     private void lazyInitNewLinesAfterNTags() {
669         if (getOutputFormat().isNewlines()) {
670             
671             newLineAfterNTags = 0;
672         } else {
673             newLineAfterNTags = getOutputFormat().getNewLineAfterNTags();
674         }
675     }
676 
677     
678 
679     /***
680      * Convenience method to just get a String result.
681      * 
682      * @param html
683      *            DOCUMENT ME!
684      * 
685      * @return a pretty printed String from the source string, preserving
686      *         whitespace in the defaultPreformattedTags set, and leaving the
687      *         close tags off of the default omitElementCloseSet set. Use one of
688      *         the write methods if you want stream output.
689      * 
690      * @throws java.io.IOException
691      * @throws java.io.UnsupportedEncodingException
692      * @throws org.dom4j.DocumentException
693      */
694     public static String prettyPrintHTML(String html)
695             throws java.io.IOException, java.io.UnsupportedEncodingException,
696             org.dom4j.DocumentException {
697         return prettyPrintHTML(html, true, true, false, true);
698     }
699 
700     /***
701      * Convenience method to just get a String result, but <b>As XHTML </b>.
702      * 
703      * @param html
704      *            DOCUMENT ME!
705      * 
706      * @return a pretty printed String from the source string, preserving
707      *         whitespace in the defaultPreformattedTags set, but conforming to
708      *         XHTML: no close tags are omitted (though if empty, they will be
709      *         converted to XHTML empty tags: <HR/> Use one of the write
710      *         methods if you want stream output.
711      * 
712      * @throws java.io.IOException
713      * @throws java.io.UnsupportedEncodingException
714      * @throws org.dom4j.DocumentException
715      */
716     public static String prettyPrintXHTML(String html)
717             throws java.io.IOException, java.io.UnsupportedEncodingException,
718             org.dom4j.DocumentException {
719         return prettyPrintHTML(html, true, true, true, false);
720     }
721 
722     /***
723      * DOCUMENT ME!
724      * 
725      * @param html
726      *            DOCUMENT ME!
727      * @param newlines
728      *            DOCUMENT ME!
729      * @param trim
730      *            DOCUMENT ME!
731      * @param isXHTML
732      *            DOCUMENT ME!
733      * @param expandEmpty
734      *            DOCUMENT ME!
735      * 
736      * @return a pretty printed String from the source string, preserving
737      *         whitespace in the defaultPreformattedTags set, and leaving the
738      *         close tags off of the default omitElementCloseSet set. This
739      *         override allows you to specify various formatter options. Use one
740      *         of the write methods if you want stream output.
741      * 
742      * @throws java.io.IOException
743      * @throws java.io.UnsupportedEncodingException
744      * @throws org.dom4j.DocumentException
745      */
746     public static String prettyPrintHTML(String html, boolean newlines,
747             boolean trim, boolean isXHTML, boolean expandEmpty)
748             throws java.io.IOException, java.io.UnsupportedEncodingException,
749             org.dom4j.DocumentException {
750         StringWriter sw = new StringWriter();
751         OutputFormat format = OutputFormat.createPrettyPrint();
752         format.setNewlines(newlines);
753         format.setTrimText(trim);
754         format.setXHTML(isXHTML);
755         format.setExpandEmptyElements(expandEmpty);
756 
757         HTMLWriter writer = new HTMLWriter(sw, format);
758         Document document = DocumentHelper.parseText(html);
759         writer.write(document);
760         writer.flush();
761 
762         return sw.toString();
763     }
764 
765     
766     
767     private class FormatState {
768         private boolean newlines = false;
769 
770         private boolean trimText = false;
771 
772         private String indent = "";
773 
774         public FormatState(boolean newLines, boolean trimText, String indent) {
775             this.newlines = newLines;
776             this.trimText = trimText;
777             this.indent = indent;
778         }
779 
780         public boolean isNewlines() {
781             return newlines;
782         }
783 
784         public boolean isTrimText() {
785             return trimText;
786         }
787 
788         public String getIndent() {
789             return indent;
790         }
791     }
792 }
793 
794 
795 
796 
797 
798 
799 
800 
801 
802 
803 
804 
805 
806 
807 
808 
809 
810 
811 
812 
813 
814 
815 
816 
817 
818 
819 
820 
821 
822 
823 
824 
825 
826 
827 
828 
829 
830 
831 
832 
833 
834 
835 
836 
837 
838 
839 
840 
841