001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.betwixt;
018 /**
019 * <p><code>XMLUtils</code> contains basic utility methods for XML.</p>
020 *
021 * <p>The code for {@link #isWellFormedXMLName} is based on code in
022 * <code>org.apache.xerces.util.XMLChar</code>
023 * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
024 * The authors of this class are credited below.</p>
025 *
026 * @author Glenn Marcy, IBM
027 * @author Andy Clark, IBM
028 * @author Eric Ye, IBM
029 * @author Arnaud Le Hors, IBM
030 * @author Rahul Srivastava, Sun Microsystems Inc.
031 *
032 * @author Robert Burrell Donkin
033 * @since 0.5
034 */
035 public class XMLUtils {
036
037 // Constants
038 //-------------------------------------------------------------------------
039
040 /** Escaped <code><</code> entity */
041 public static final String LESS_THAN_ENTITY = "<";
042 /** Escaped <code>></code> entity */
043 public static final String GREATER_THAN_ENTITY = ">";
044 /** Escaped <code>&</code> entity */
045 public static final String AMPERSAND_ENTITY = "&";
046 /** Escaped <code>'</code> entity */
047 public static final String APOSTROPHE_ENTITY = "'";
048 /** Escaped <code>"</code> entity */
049 public static final String QUOTE_ENTITY = """;
050
051 // Used by isWellFormedXMLName
052 /** Name start character mask. */
053 private static final int MASK_NAME_START = 0x01;
054 /** Name character mask. */
055 private static final int MASK_NAME = 0x02;
056
057 // Class attributes
058 //-------------------------------------------------------------------------
059
060 /** Character flags. */
061 private static final byte[] CHARS = new byte[1 << 16];
062
063 //
064 // Static initialization
065 //
066
067 static {
068
069 //
070 // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
071 // CombiningChar | Extender
072 //
073
074 int nameChar[] = {
075 0x002D, 0x002E, // '-' and '.'
076 };
077
078 //
079 // [5] Name ::= (Letter | '_' | ':') (NameChar)*
080 //
081
082 int nameStartChar[] = {
083 0x003A, 0x005F, // ':' and '_'
084 };
085
086 //
087 // [84] Letter ::= BaseChar | Ideographic
088 //
089
090 int letterRange[] = {
091 // BaseChar
092 0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
093 0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
094 0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
095 0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
096 0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
097 0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
098 0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
099 0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
100 0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
101 0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
102 0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
103 0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
104 0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
105 0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
106 0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
107 0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
108 0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
109 0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
110 0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
111 0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
112 0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
113 0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
114 0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
115 0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
116 0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
117 0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
118 0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
119 0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
120 0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
121 0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
122 0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
123 0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
124 0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
125 0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
126 0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
127 0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
128 0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
129 0xAC00, 0xD7A3,
130 // Ideographic
131 0x3021, 0x3029, 0x4E00, 0x9FA5,
132 };
133 int letterChar[] = {
134 // BaseChar
135 0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
136 0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
137 0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
138 0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
139 0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
140 0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
141 0x1F5D, 0x1FBE, 0x2126, 0x212E,
142 // Ideographic
143 0x3007,
144 };
145
146 //
147 // [87] CombiningChar ::= ...
148 //
149
150 int combiningCharRange[] = {
151 0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
152 0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
153 0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
154 0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
155 0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
156 0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
157 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
158 0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
159 0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
160 0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
161 0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
162 0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
163 0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
164 0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
165 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
166 0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
167 0x20D0, 0x20DC, 0x302A, 0x302F,
168 };
169
170 int combiningCharChar[] = {
171 0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
172 0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
173 0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
174 0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
175 };
176
177 //
178 // [88] Digit ::= ...
179 //
180
181 int digitRange[] = {
182 0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
183 0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
184 0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
185 0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
186 };
187
188 //
189 // [89] Extender ::= ...
190 //
191
192 int extenderRange[] = {
193 0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
194 };
195
196 int extenderChar[] = {
197 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
198 };
199
200 //
201 // Initialize
202 //
203
204 // set name start characters
205 for (int i = 0; i < nameStartChar.length; i++) {
206 CHARS[nameStartChar[i]] |= MASK_NAME_START | MASK_NAME;
207 }
208 for (int i = 0; i < letterRange.length; i += 2) {
209 for (int j = letterRange[i]; j <= letterRange[i + 1]; j++) {
210 CHARS[j] |= MASK_NAME_START | MASK_NAME;
211 }
212 }
213 for (int i = 0; i < letterChar.length; i++) {
214 CHARS[letterChar[i]] |= MASK_NAME_START | MASK_NAME;
215 }
216
217 // set name characters
218 for (int i = 0; i < nameChar.length; i++) {
219 CHARS[nameChar[i]] |= MASK_NAME;
220 }
221 for (int i = 0; i < digitRange.length; i += 2) {
222 for (int j = digitRange[i]; j <= digitRange[i + 1]; j++) {
223 CHARS[j] |= MASK_NAME;
224 }
225 }
226 for (int i = 0; i < combiningCharRange.length; i += 2) {
227 for (int j = combiningCharRange[i]; j <= combiningCharRange[i + 1]; j++) {
228 CHARS[j] |= MASK_NAME;
229 }
230 }
231 for (int i = 0; i < combiningCharChar.length; i++) {
232 CHARS[combiningCharChar[i]] |= MASK_NAME;
233 }
234 for (int i = 0; i < extenderRange.length; i += 2) {
235 for (int j = extenderRange[i]; j <= extenderRange[i + 1]; j++) {
236 CHARS[j] |= MASK_NAME;
237 }
238 }
239 for (int i = 0; i < extenderChar.length; i++) {
240 CHARS[extenderChar[i]] |= MASK_NAME;
241 }
242
243 }
244
245 // Constructor
246 //-------------------------------------------------------------------------
247
248 /**
249 * <p>Constructor for use by tools that required <code>JavaBean</code> instances.</p>
250 *
251 * <p>This constructor is public <strong>only</strong>
252 * to permit tools that require a JavaBean instance to operate.
253 * <code>XMLUtils</code> instances should <strong>not</strong> be constructed in standard
254 * programming. Instead, the class methods should be called directly.</p>
255 */
256 public XMLUtils() {}
257
258 // Class methods
259 //-------------------------------------------------------------------------
260
261 /**
262 * <p>Escape the <code>toString</code> of the given object.
263 * For use as body text.</p>
264 *
265 * @param value escape <code>value.toString()</code>
266 * @return text with escaped delimiters
267 */
268 public static final String escapeBodyValue(Object value) {
269 StringBuffer buffer = new StringBuffer(value.toString());
270 for (int i=0, size = buffer.length(); i <size; i++) {
271 switch (buffer.charAt(i)) {
272 case '<':
273 buffer.replace(i, i+1, LESS_THAN_ENTITY);
274 size += 3;
275 i+=3;
276 break;
277 case '>':
278 buffer.replace(i, i+1, GREATER_THAN_ENTITY);
279 size += 3;
280 i += 3;
281 break;
282 case '&':
283 buffer.replace(i, i+1, AMPERSAND_ENTITY);
284 size += 4;
285 i += 4;
286 break;
287 }
288 }
289 return buffer.toString();
290 }
291
292 /**
293 * <p>Escape the <code>toString</code> of the given object.
294 * For use in an attribute value.</p>
295 *
296 * @param value escape <code>value.toString()</code>
297 * @return text with characters restricted (for use in attributes) escaped
298 */
299 public static final String escapeAttributeValue(Object value) {
300 StringBuffer buffer = new StringBuffer(value.toString());
301 for (int i=0, size = buffer.length(); i <size; i++) {
302 switch (buffer.charAt(i)) {
303 case '<':
304 buffer.replace(i, i+1, LESS_THAN_ENTITY);
305 size += 3;
306 i+=3;
307 break;
308 case '>':
309 buffer.replace(i, i+1, GREATER_THAN_ENTITY);
310 size += 3;
311 i += 3;
312 break;
313 case '&':
314 buffer.replace(i, i+1, AMPERSAND_ENTITY);
315 size += 4;
316 i += 4;
317 break;
318 case '\'':
319 buffer.replace(i, i+1, APOSTROPHE_ENTITY);
320 size += 5;
321 i += 5;
322 break;
323 case '\"':
324 buffer.replace(i, i+1, QUOTE_ENTITY);
325 size += 5;
326 i += 5;
327 break;
328 }
329 }
330 return buffer.toString();
331 }
332
333
334 /**
335 * Escapes the given content suitable for insertion within a
336 * <code>CDATA</code> sequence.
337 * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
338 * string ']]>' is recognized as markup.
339 * @param content the body content whose character data should
340 * be escaped in a way appropriate for use within a <code>CDATA</code>
341 * section of xml.
342 * @return escaped character data, not null
343 */
344 public static final String escapeCDATAContent(String content) {
345 StringBuffer buffer = new StringBuffer(content);
346 escapeCDATAContent(buffer);
347 return buffer.toString();
348 }
349
350 /**
351 * Escapes the given content suitable for insertion within a
352 * <code>CDATA</code> sequence.
353 * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
354 * string ']]>' is recognized as markup.
355 * @param bufferedContent the body content within a buffer
356 * whose character data should
357 * be escaped in a way appropriate for use within a <code>CDATA</code>
358 * section of xml
359 */
360 public static final void escapeCDATAContent(StringBuffer bufferedContent) {
361 for (int i=2, size = bufferedContent.length(); i<size; i++) {
362 char at = bufferedContent.charAt(i);
363 if ( at == '>'
364 && bufferedContent.charAt(i-1) == ']'
365 && bufferedContent.charAt(i-2) == ']') {
366
367 bufferedContent.replace(i, i+1, GREATER_THAN_ENTITY);
368 size += 3;
369 i+=3;
370 }
371 }
372 }
373
374
375 /**
376 * <p>Is this string a well formed xml name?</p>
377 *
378 * <p>Only certain characters are allowed in well formed element and attribute
379 * names in xml. For example, white space is not allowed in a name.</p>
380 *
381 * <p>The code for this method is based on code in
382 * <code>org.apache.xerces.util.XMLChar</code>
383 * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
384 * The authors of this class are credited at the top of this class.</p>
385 *
386 * @param name the <code>String</code> to be checked for use as an xml attribute
387 * or element name. Returns false if <code>name</code> is null
388 * @return true if this string would be a well-formed name
389 */
390 public static boolean isWellFormedXMLName( String name ) {
391 if ( name == null ) {
392 return false;
393 }
394
395 if ( name.length() == 0 ) {
396 return false;
397 }
398
399 char ch = name.charAt(0);
400 if( isNameStartChar(ch) == false) {
401 return false;
402
403 }
404
405 for (int i = 1; i < name.length(); i++ ) {
406 ch = name.charAt(i);
407 if( isNameChar( ch ) == false ) {
408 return false;
409 }
410 }
411 return true;
412 }
413
414 /**
415 * Returns true if the specified character is a valid name
416 * character as defined by the XML 1.0 specification.
417 *
418 * @param c The character to check.
419 * @return true if this is an XML name character
420 */
421 public static boolean isNameChar(int c) {
422 return c < 0x10000 && (CHARS[c] & MASK_NAME) != 0;
423 }
424
425 /**
426 * Returns true if the specified character is a valid name start
427 * character as defined in the XML 1.0 specification.
428 *
429 * @param c The character to check.
430 * @return trus if this is an XML name start character
431 */
432 public static boolean isNameStartChar(int c) {
433 return c < 0x10000 && (CHARS[c] & MASK_NAME_START) != 0;
434 }
435 }