001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang.text;
018
019 import java.util.ArrayList;
020 import java.util.Collections;
021 import java.util.List;
022 import java.util.ListIterator;
023 import java.util.NoSuchElementException;
024
025 /**
026 * Tokenizes a string based based on delimiters (separators)
027 * and supporting quoting and ignored character concepts.
028 * <p>
029 * This class can split a String into many smaller strings. It aims
030 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
031 * however it offers much more control and flexibility including implementing
032 * the <code>ListIterator</code> interface. By default, it is set up
033 * like <code>StringTokenizer</code>.
034 * <p>
035 * The input String is split into a number of <i>tokens</i>.
036 * Each token is separated from the next String by a <i>delimiter</i>.
037 * One or more delimiter characters must be specified.
038 * <p>
039 * Each token may be surrounded by quotes.
040 * The <i>quote</i> matcher specifies the quote character(s).
041 * A quote may be escaped within a quoted section by duplicating itself.
042 * <p>
043 * Between each token and the delimiter are potentially characters that need trimming.
044 * The <i>trimmer</i> matcher specifies these characters.
045 * One usage might be to trim whitespace characters.
046 * <p>
047 * At any point outside the quotes there might potentially be invalid characters.
048 * The <i>ignored</i> matcher specifies these characters to be removed.
049 * One usage might be to remove new line characters.
050 * <p>
051 * Empty tokens may be removed or returned as null.
052 * <pre>
053 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
054 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
055 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
056 * </pre>
057 * <p>
058 *
059 * This tokenizer has the following properties and options:
060 *
061 * <table>
062 * <tr>
063 * <th>Property</th><th>Type</th><th>Default</th>
064 * </tr>
065 * <tr>
066 * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
067 * </tr>
068 * <tr>
069 * <td>quote</td><td>NoneMatcher</td><td>{}</td>
070 * </tr>
071 * <tr>
072 * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
073 * </tr>
074 * <tr>
075 * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
076 * </tr>
077 * <tr>
078 * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
079 * </tr>
080 * </table>
081 *
082 * @author Matthew Inger
083 * @author Stephen Colebourne
084 * @author Gary D. Gregory
085 * @since 2.2
086 * @version $Id: StrTokenizer.java 592077 2007-11-05 16:47:10Z mbenson $
087 */
088 public class StrTokenizer implements ListIterator, Cloneable {
089
090 private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
091 private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
092 static {
093 CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
094 CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
095 CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
096 CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
097 CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
098 CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
099 CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
100
101 TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
102 TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
103 TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
104 TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
105 TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
106 TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
107 TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
108 }
109
110 /** The text to work on. */
111 private char chars[];
112 /** The parsed tokens */
113 private String tokens[];
114 /** The current iteration position */
115 private int tokenPos;
116
117 /** The delimiter matcher */
118 private StrMatcher delimMatcher = StrMatcher.splitMatcher();
119 /** The quote matcher */
120 private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
121 /** The ignored matcher */
122 private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
123 /** The trimmer matcher */
124 private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
125
126 /** Whether to return empty tokens as null */
127 private boolean emptyAsNull = false;
128 /** Whether to ignore empty tokens */
129 private boolean ignoreEmptyTokens = true;
130
131 //-----------------------------------------------------------------------
132
133 /**
134 * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
135 *
136 * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
137 */
138 private static StrTokenizer getCSVClone() {
139 return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
140 }
141
142 /**
143 * Gets a new tokenizer instance which parses Comma Separated Value strings
144 * initializing it with the given input. The default for CSV processing
145 * will be trim whitespace from both ends (which can be overridden with
146 * the setTrimmer method).
147 * <p>
148 * You must call a "reset" method to set the string which you want to parse.
149 * @return a new tokenizer instance which parses Comma Separated Value strings
150 */
151 public static StrTokenizer getCSVInstance() {
152 return getCSVClone();
153 }
154
155 /**
156 * Gets a new tokenizer instance which parses Comma Separated Value strings
157 * initializing it with the given input. The default for CSV processing
158 * will be trim whitespace from both ends (which can be overridden with
159 * the setTrimmer method).
160 *
161 * @param input the text to parse
162 * @return a new tokenizer instance which parses Comma Separated Value strings
163 */
164 public static StrTokenizer getCSVInstance(String input) {
165 StrTokenizer tok = getCSVClone();
166 tok.reset(input);
167 return tok;
168 }
169
170 /**
171 * Gets a new tokenizer instance which parses Comma Separated Value strings
172 * initializing it with the given input. The default for CSV processing
173 * will be trim whitespace from both ends (which can be overridden with
174 * the setTrimmer method).
175 *
176 * @param input the text to parse
177 * @return a new tokenizer instance which parses Comma Separated Value strings
178 */
179 public static StrTokenizer getCSVInstance(char[] input) {
180 StrTokenizer tok = getCSVClone();
181 tok.reset(input);
182 return tok;
183 }
184
185 /**
186 * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
187 *
188 * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
189 */
190 private static StrTokenizer getTSVClone() {
191 return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
192 }
193
194
195 /**
196 * Gets a new tokenizer instance which parses Tab Separated Value strings.
197 * The default for CSV processing will be trim whitespace from both ends
198 * (which can be overridden with the setTrimmer method).
199 * <p>
200 * You must call a "reset" method to set the string which you want to parse.
201 * @return a new tokenizer instance which parses Tab Separated Value strings.
202 */
203 public static StrTokenizer getTSVInstance() {
204 return getTSVClone();
205 }
206
207 /**
208 * Gets a new tokenizer instance which parses Tab Separated Value strings.
209 * The default for CSV processing will be trim whitespace from both ends
210 * (which can be overridden with the setTrimmer method).
211 * @param input the string to parse
212 * @return a new tokenizer instance which parses Tab Separated Value strings.
213 */
214 public static StrTokenizer getTSVInstance(String input) {
215 StrTokenizer tok = getTSVClone();
216 tok.reset(input);
217 return tok;
218 }
219
220 /**
221 * Gets a new tokenizer instance which parses Tab Separated Value strings.
222 * The default for CSV processing will be trim whitespace from both ends
223 * (which can be overridden with the setTrimmer method).
224 * @param input the string to parse
225 * @return a new tokenizer instance which parses Tab Separated Value strings.
226 */
227 public static StrTokenizer getTSVInstance(char[] input) {
228 StrTokenizer tok = getTSVClone();
229 tok.reset(input);
230 return tok;
231 }
232
233 //-----------------------------------------------------------------------
234 /**
235 * Constructs a tokenizer splitting on space, tab, newline and formfeed
236 * as per StringTokenizer, but with no text to tokenize.
237 * <p>
238 * This constructor is normally used with {@link #reset(String)}.
239 */
240 public StrTokenizer() {
241 super();
242 this.chars = null;
243 }
244
245 /**
246 * Constructs a tokenizer splitting on space, tab, newline and formfeed
247 * as per StringTokenizer.
248 *
249 * @param input the string which is to be parsed
250 */
251 public StrTokenizer(String input) {
252 super();
253 if (input != null) {
254 chars = input.toCharArray();
255 } else {
256 chars = null;
257 }
258 }
259
260 /**
261 * Constructs a tokenizer splitting on the specified delimiter character.
262 *
263 * @param input the string which is to be parsed
264 * @param delim the field delimiter character
265 */
266 public StrTokenizer(String input, char delim) {
267 this(input);
268 setDelimiterChar(delim);
269 }
270
271 /**
272 * Constructs a tokenizer splitting on the specified delimiter string.
273 *
274 * @param input the string which is to be parsed
275 * @param delim the field delimiter string
276 */
277 public StrTokenizer(String input, String delim) {
278 this(input);
279 setDelimiterString(delim);
280 }
281
282 /**
283 * Constructs a tokenizer splitting using the specified delimiter matcher.
284 *
285 * @param input the string which is to be parsed
286 * @param delim the field delimiter matcher
287 */
288 public StrTokenizer(String input, StrMatcher delim) {
289 this(input);
290 setDelimiterMatcher(delim);
291 }
292
293 /**
294 * Constructs a tokenizer splitting on the specified delimiter character
295 * and handling quotes using the specified quote character.
296 *
297 * @param input the string which is to be parsed
298 * @param delim the field delimiter character
299 * @param quote the field quoted string character
300 */
301 public StrTokenizer(String input, char delim, char quote) {
302 this(input, delim);
303 setQuoteChar(quote);
304 }
305
306 /**
307 * Constructs a tokenizer splitting using the specified delimiter matcher
308 * and handling quotes using the specified quote matcher.
309 *
310 * @param input the string which is to be parsed
311 * @param delim the field delimiter matcher
312 * @param quote the field quoted string matcher
313 */
314 public StrTokenizer(String input, StrMatcher delim, StrMatcher quote) {
315 this(input, delim);
316 setQuoteMatcher(quote);
317 }
318
319 /**
320 * Constructs a tokenizer splitting on space, tab, newline and formfeed
321 * as per StringTokenizer.
322 * <p>
323 * The input character array is not cloned, and must not be altered after
324 * passing in to this method.
325 *
326 * @param input the string which is to be parsed, not cloned
327 */
328 public StrTokenizer(char[] input) {
329 super();
330 this.chars = input;
331 }
332
333 /**
334 * Constructs a tokenizer splitting on the specified character.
335 * <p>
336 * The input character array is not cloned, and must not be altered after
337 * passing in to this method.
338 *
339 * @param input the string which is to be parsed, not cloned
340 * @param delim the field delimiter character
341 */
342 public StrTokenizer(char[] input, char delim) {
343 this(input);
344 setDelimiterChar(delim);
345 }
346
347 /**
348 * Constructs a tokenizer splitting on the specified string.
349 * <p>
350 * The input character array is not cloned, and must not be altered after
351 * passing in to this method.
352 *
353 * @param input the string which is to be parsed, not cloned
354 * @param delim the field delimiter string
355 */
356 public StrTokenizer(char[] input, String delim) {
357 this(input);
358 setDelimiterString(delim);
359 }
360
361 /**
362 * Constructs a tokenizer splitting using the specified delimiter matcher.
363 * <p>
364 * The input character array is not cloned, and must not be altered after
365 * passing in to this method.
366 *
367 * @param input the string which is to be parsed, not cloned
368 * @param delim the field delimiter matcher
369 */
370 public StrTokenizer(char[] input, StrMatcher delim) {
371 this(input);
372 setDelimiterMatcher(delim);
373 }
374
375 /**
376 * Constructs a tokenizer splitting on the specified delimiter character
377 * and handling quotes using the specified quote character.
378 * <p>
379 * The input character array is not cloned, and must not be altered after
380 * passing in to this method.
381 *
382 * @param input the string which is to be parsed, not cloned
383 * @param delim the field delimiter character
384 * @param quote the field quoted string character
385 */
386 public StrTokenizer(char[] input, char delim, char quote) {
387 this(input, delim);
388 setQuoteChar(quote);
389 }
390
391 /**
392 * Constructs a tokenizer splitting using the specified delimiter matcher
393 * and handling quotes using the specified quote matcher.
394 * <p>
395 * The input character array is not cloned, and must not be altered after
396 * passing in to this method.
397 *
398 * @param input the string which is to be parsed, not cloned
399 * @param delim the field delimiter character
400 * @param quote the field quoted string character
401 */
402 public StrTokenizer(char[] input, StrMatcher delim, StrMatcher quote) {
403 this(input, delim);
404 setQuoteMatcher(quote);
405 }
406
407 // API
408 //-----------------------------------------------------------------------
409 /**
410 * Gets the number of tokens found in the String.
411 *
412 * @return the number of matched tokens
413 */
414 public int size() {
415 checkTokenized();
416 return tokens.length;
417 }
418
419 /**
420 * Gets the next token from the String.
421 *
422 * @return the next sequential token, or null when no more tokens are found
423 */
424 public String nextToken() {
425 if (hasNext()) {
426 return tokens[tokenPos++];
427 }
428 return null;
429 }
430
431 /**
432 * Gets the previous token from the String.
433 *
434 * @return the previous sequential token, or null when no more tokens are found
435 */
436 public String previousToken() {
437 if (hasPrevious()) {
438 return tokens[--tokenPos];
439 }
440 return null;
441 }
442
443 /**
444 * Gets a copy of the full token list as an independent modifiable array.
445 *
446 * @return the tokens as a String array
447 */
448 public String[] getTokenArray() {
449 checkTokenized();
450 return (String[]) tokens.clone();
451 }
452
453 /**
454 * Gets a copy of the full token list as an independent modifiable list.
455 *
456 * @return the tokens as a String array
457 */
458 public List getTokenList() {
459 checkTokenized();
460 List list = new ArrayList(tokens.length);
461 for (int i = 0; i < tokens.length; i++) {
462 list.add(tokens[i]);
463 }
464 return list;
465 }
466
467 /**
468 * Resets this tokenizer, forgetting all parsing and iteration already completed.
469 * <p>
470 * This method allows the same tokenizer to be reused for the same String.
471 *
472 * @return this, to enable chaining
473 */
474 public StrTokenizer reset() {
475 tokenPos = 0;
476 tokens = null;
477 return this;
478 }
479
480 /**
481 * Reset this tokenizer, giving it a new input string to parse.
482 * In this manner you can re-use a tokenizer with the same settings
483 * on multiple input lines.
484 *
485 * @param input the new string to tokenize, null sets no text to parse
486 * @return this, to enable chaining
487 */
488 public StrTokenizer reset(String input) {
489 reset();
490 if (input != null) {
491 this.chars = input.toCharArray();
492 } else {
493 this.chars = null;
494 }
495 return this;
496 }
497
498 /**
499 * Reset this tokenizer, giving it a new input string to parse.
500 * In this manner you can re-use a tokenizer with the same settings
501 * on multiple input lines.
502 * <p>
503 * The input character array is not cloned, and must not be altered after
504 * passing in to this method.
505 *
506 * @param input the new character array to tokenize, not cloned, null sets no text to parse
507 * @return this, to enable chaining
508 */
509 public StrTokenizer reset(char[] input) {
510 reset();
511 this.chars = input;
512 return this;
513 }
514
515 // ListIterator
516 //-----------------------------------------------------------------------
517 /**
518 * Checks whether there are any more tokens.
519 *
520 * @return true if there are more tokens
521 */
522 public boolean hasNext() {
523 checkTokenized();
524 return tokenPos < tokens.length;
525 }
526
527 /**
528 * Gets the next token. This method is equivalent to {@link #nextToken()}.
529 *
530 * @return the next String token
531 */
532 public Object next() {
533 if (hasNext()) {
534 return tokens[tokenPos++];
535 }
536 throw new NoSuchElementException();
537 }
538
539 /**
540 * Gets the index of the next token to return.
541 *
542 * @return the next token index
543 */
544 public int nextIndex() {
545 return tokenPos;
546 }
547
548 /**
549 * Checks whether there are any previous tokens that can be iterated to.
550 *
551 * @return true if there are previous tokens
552 */
553 public boolean hasPrevious() {
554 checkTokenized();
555 return tokenPos > 0;
556 }
557
558 /**
559 * Gets the token previous to the last returned token.
560 *
561 * @return the previous token
562 */
563 public Object previous() {
564 if (hasPrevious()) {
565 return tokens[--tokenPos];
566 }
567 throw new NoSuchElementException();
568 }
569
570 /**
571 * Gets the index of the previous token.
572 *
573 * @return the previous token index
574 */
575 public int previousIndex() {
576 return tokenPos - 1;
577 }
578
579 /**
580 * Unsupported ListIterator operation.
581 *
582 * @throws UnsupportedOperationException always
583 */
584 public void remove() {
585 throw new UnsupportedOperationException("remove() is unsupported");
586 }
587
588 /**
589 * Unsupported ListIterator operation.
590 * @param obj this parameter ignored.
591 * @throws UnsupportedOperationException always
592 */
593 public void set(Object obj) {
594 throw new UnsupportedOperationException("set() is unsupported");
595 }
596
597 /**
598 * Unsupported ListIterator operation.
599 * @param obj this parameter ignored.
600 * @throws UnsupportedOperationException always
601 */
602 public void add(Object obj) {
603 throw new UnsupportedOperationException("add() is unsupported");
604 }
605
606 // Implementation
607 //-----------------------------------------------------------------------
608 /**
609 * Checks if tokenization has been done, and if not then do it.
610 */
611 private void checkTokenized() {
612 if (tokens == null) {
613 if (chars == null) {
614 // still call tokenize as subclass may do some work
615 List split = tokenize(null, 0, 0);
616 tokens = (String[]) split.toArray(new String[split.size()]);
617 } else {
618 List split = tokenize(chars, 0, chars.length);
619 tokens = (String[]) split.toArray(new String[split.size()]);
620 }
621 }
622 }
623
624 /**
625 * Internal method to performs the tokenization.
626 * <p>
627 * Most users of this class do not need to call this method. This method
628 * will be called automatically by other (public) methods when required.
629 * <p>
630 * This method exists to allow subclasses to add code before or after the
631 * tokenization. For example, a subclass could alter the character array,
632 * offset or count to be parsed, or call the tokenizer multiple times on
633 * multiple strings. It is also be possible to filter the results.
634 * <p>
635 * <code>StrTokenizer</code> will always pass a zero offset and a count
636 * equal to the length of the array to this method, however a subclass
637 * may pass other values, or even an entirely different array.
638 *
639 * @param chars the character array being tokenized, may be null
640 * @param offset the start position within the character array, must be valid
641 * @param count the number of characters to tokenize, must be valid
642 * @return the modifiable list of String tokens, unmodifiable if null array or zero count
643 */
644 protected List tokenize(char[] chars, int offset, int count) {
645 if (chars == null || count == 0) {
646 return Collections.EMPTY_LIST;
647 }
648 StrBuilder buf = new StrBuilder();
649 List tokens = new ArrayList();
650 int pos = offset;
651
652 // loop around the entire buffer
653 while (pos >= 0 && pos < count) {
654 // find next token
655 pos = readNextToken(chars, pos, count, buf, tokens);
656
657 // handle case where end of string is a delimiter
658 if (pos >= count) {
659 addToken(tokens, "");
660 }
661 }
662 return tokens;
663 }
664
665 /**
666 * Adds a token to a list, paying attention to the parameters we've set.
667 *
668 * @param list the list to add to
669 * @param tok the token to add
670 */
671 private void addToken(List list, String tok) {
672 if (tok == null || tok.length() == 0) {
673 if (isIgnoreEmptyTokens()) {
674 return;
675 }
676 if (isEmptyTokenAsNull()) {
677 tok = null;
678 }
679 }
680 list.add(tok);
681 }
682
683 /**
684 * Reads character by character through the String to get the next token.
685 *
686 * @param chars the character array being tokenized
687 * @param start the first character of field
688 * @param len the length of the character array being tokenized
689 * @param workArea a temporary work area
690 * @param tokens the list of parsed tokens
691 * @return the starting position of the next field (the character
692 * immediately after the delimiter), or -1 if end of string found
693 */
694 private int readNextToken(char[] chars, int start, int len, StrBuilder workArea, List tokens) {
695 // skip all leading whitespace, unless it is the
696 // field delimiter or the quote character
697 while (start < len) {
698 int removeLen = Math.max(
699 getIgnoredMatcher().isMatch(chars, start, start, len),
700 getTrimmerMatcher().isMatch(chars, start, start, len));
701 if (removeLen == 0 ||
702 getDelimiterMatcher().isMatch(chars, start, start, len) > 0 ||
703 getQuoteMatcher().isMatch(chars, start, start, len) > 0) {
704 break;
705 }
706 start += removeLen;
707 }
708
709 // handle reaching end
710 if (start >= len) {
711 addToken(tokens, "");
712 return -1;
713 }
714
715 // handle empty token
716 int delimLen = getDelimiterMatcher().isMatch(chars, start, start, len);
717 if (delimLen > 0) {
718 addToken(tokens, "");
719 return start + delimLen;
720 }
721
722 // handle found token
723 int quoteLen = getQuoteMatcher().isMatch(chars, start, start, len);
724 if (quoteLen > 0) {
725 return readWithQuotes(chars, start + quoteLen, len, workArea, tokens, start, quoteLen);
726 }
727 return readWithQuotes(chars, start, len, workArea, tokens, 0, 0);
728 }
729
730 /**
731 * Reads a possibly quoted string token.
732 *
733 * @param chars the character array being tokenized
734 * @param start the first character of field
735 * @param len the length of the character array being tokenized
736 * @param workArea a temporary work area
737 * @param tokens the list of parsed tokens
738 * @param quoteStart the start position of the matched quote, 0 if no quoting
739 * @param quoteLen the length of the matched quote, 0 if no quoting
740 * @return the starting position of the next field (the character
741 * immediately after the delimiter, or if end of string found,
742 * then the length of string
743 */
744 private int readWithQuotes(char[] chars, int start, int len, StrBuilder workArea,
745 List tokens, int quoteStart, int quoteLen)
746 {
747 // Loop until we've found the end of the quoted
748 // string or the end of the input
749 workArea.clear();
750 int pos = start;
751 boolean quoting = (quoteLen > 0);
752 int trimStart = 0;
753
754 while (pos < len) {
755 // quoting mode can occur several times throughout a string
756 // we must switch between quoting and non-quoting until we
757 // encounter a non-quoted delimiter, or end of string
758 if (quoting) {
759 // In quoting mode
760
761 // If we've found a quote character, see if it's
762 // followed by a second quote. If so, then we need
763 // to actually put the quote character into the token
764 // rather than end the token.
765 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
766 if (isQuote(chars, pos + quoteLen, len, quoteStart, quoteLen)) {
767 // matched pair of quotes, thus an escaped quote
768 workArea.append(chars, pos, quoteLen);
769 pos += (quoteLen * 2);
770 trimStart = workArea.size();
771 continue;
772 }
773
774 // end of quoting
775 quoting = false;
776 pos += quoteLen;
777 continue;
778 }
779
780 // copy regular character from inside quotes
781 workArea.append(chars[pos++]);
782 trimStart = workArea.size();
783
784 } else {
785 // Not in quoting mode
786
787 // check for delimiter, and thus end of token
788 int delimLen = getDelimiterMatcher().isMatch(chars, pos, start, len);
789 if (delimLen > 0) {
790 // return condition when end of token found
791 addToken(tokens, workArea.substring(0, trimStart));
792 return pos + delimLen;
793 }
794
795 // check for quote, and thus back into quoting mode
796 if (quoteLen > 0) {
797 if (isQuote(chars, pos, len, quoteStart, quoteLen)) {
798 quoting = true;
799 pos += quoteLen;
800 continue;
801 }
802 }
803
804 // check for ignored (outside quotes), and ignore
805 int ignoredLen = getIgnoredMatcher().isMatch(chars, pos, start, len);
806 if (ignoredLen > 0) {
807 pos += ignoredLen;
808 continue;
809 }
810
811 // check for trimmed character
812 // don't yet know if its at the end, so copy to workArea
813 // use trimStart to keep track of trim at the end
814 int trimmedLen = getTrimmerMatcher().isMatch(chars, pos, start, len);
815 if (trimmedLen > 0) {
816 workArea.append(chars, pos, trimmedLen);
817 pos += trimmedLen;
818 continue;
819 }
820
821 // copy regular character from outside quotes
822 workArea.append(chars[pos++]);
823 trimStart = workArea.size();
824 }
825 }
826
827 // return condition when end of string found
828 addToken(tokens, workArea.substring(0, trimStart));
829 return -1;
830 }
831
832 /**
833 * Checks if the characters at the index specified match the quote
834 * already matched in readNextToken().
835 *
836 * @param chars the character array being tokenized
837 * @param pos the position to check for a quote
838 * @param len the length of the character array being tokenized
839 * @param quoteStart the start position of the matched quote, 0 if no quoting
840 * @param quoteLen the length of the matched quote, 0 if no quoting
841 * @return true if a quote is matched
842 */
843 private boolean isQuote(char[] chars, int pos, int len, int quoteStart, int quoteLen) {
844 for (int i = 0; i < quoteLen; i++) {
845 if ((pos + i) >= len || chars[pos + i] != chars[quoteStart + i]) {
846 return false;
847 }
848 }
849 return true;
850 }
851
852 // Delimiter
853 //-----------------------------------------------------------------------
854 /**
855 * Gets the field delimiter matcher.
856 *
857 * @return the delimiter matcher in use
858 */
859 public StrMatcher getDelimiterMatcher() {
860 return this.delimMatcher;
861 }
862
863 /**
864 * Sets the field delimiter matcher.
865 * <p>
866 * The delimitier is used to separate one token from another.
867 *
868 * @param delim the delimiter matcher to use
869 * @return this, to enable chaining
870 */
871 public StrTokenizer setDelimiterMatcher(StrMatcher delim) {
872 if (delim == null) {
873 this.delimMatcher = StrMatcher.noneMatcher();
874 } else {
875 this.delimMatcher = delim;
876 }
877 return this;
878 }
879
880 /**
881 * Sets the field delimiter character.
882 *
883 * @param delim the delimiter character to use
884 * @return this, to enable chaining
885 */
886 public StrTokenizer setDelimiterChar(char delim) {
887 return setDelimiterMatcher(StrMatcher.charMatcher(delim));
888 }
889
890 /**
891 * Sets the field delimiter string.
892 *
893 * @param delim the delimiter string to use
894 * @return this, to enable chaining
895 */
896 public StrTokenizer setDelimiterString(String delim) {
897 return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
898 }
899
900 // Quote
901 //-----------------------------------------------------------------------
902 /**
903 * Gets the quote matcher currently in use.
904 * <p>
905 * The quote character is used to wrap data between the tokens.
906 * This enables delimiters to be entered as data.
907 * The default value is '"' (double quote).
908 *
909 * @return the quote matcher in use
910 */
911 public StrMatcher getQuoteMatcher() {
912 return quoteMatcher;
913 }
914
915 /**
916 * Set the quote matcher to use.
917 * <p>
918 * The quote character is used to wrap data between the tokens.
919 * This enables delimiters to be entered as data.
920 *
921 * @param quote the quote matcher to use, null ignored
922 * @return this, to enable chaining
923 */
924 public StrTokenizer setQuoteMatcher(StrMatcher quote) {
925 if (quote != null) {
926 this.quoteMatcher = quote;
927 }
928 return this;
929 }
930
931 /**
932 * Sets the quote character to use.
933 * <p>
934 * The quote character is used to wrap data between the tokens.
935 * This enables delimiters to be entered as data.
936 *
937 * @param quote the quote character to use
938 * @return this, to enable chaining
939 */
940 public StrTokenizer setQuoteChar(char quote) {
941 return setQuoteMatcher(StrMatcher.charMatcher(quote));
942 }
943
944 // Ignored
945 //-----------------------------------------------------------------------
946 /**
947 * Gets the ignored character matcher.
948 * <p>
949 * These characters are ignored when parsing the String, unless they are
950 * within a quoted region.
951 * The default value is not to ignore anything.
952 *
953 * @return the ignored matcher in use
954 */
955 public StrMatcher getIgnoredMatcher() {
956 return ignoredMatcher;
957 }
958
959 /**
960 * Set the matcher for characters to ignore.
961 * <p>
962 * These characters are ignored when parsing the String, unless they are
963 * within a quoted region.
964 *
965 * @param ignored the ignored matcher to use, null ignored
966 * @return this, to enable chaining
967 */
968 public StrTokenizer setIgnoredMatcher(StrMatcher ignored) {
969 if (ignored != null) {
970 this.ignoredMatcher = ignored;
971 }
972 return this;
973 }
974
975 /**
976 * Set the character to ignore.
977 * <p>
978 * This character is ignored when parsing the String, unless it is
979 * within a quoted region.
980 *
981 * @param ignored the ignored character to use
982 * @return this, to enable chaining
983 */
984 public StrTokenizer setIgnoredChar(char ignored) {
985 return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
986 }
987
988 // Trimmer
989 //-----------------------------------------------------------------------
990 /**
991 * Gets the trimmer character matcher.
992 * <p>
993 * These characters are trimmed off on each side of the delimiter
994 * until the token or quote is found.
995 * The default value is not to trim anything.
996 *
997 * @return the trimmer matcher in use
998 */
999 public StrMatcher getTrimmerMatcher() {
1000 return trimmerMatcher;
1001 }
1002
1003 /**
1004 * Sets the matcher for characters to trim.
1005 * <p>
1006 * These characters are trimmed off on each side of the delimiter
1007 * until the token or quote is found.
1008 *
1009 * @param trimmer the trimmer matcher to use, null ignored
1010 * @return this, to enable chaining
1011 */
1012 public StrTokenizer setTrimmerMatcher(StrMatcher trimmer) {
1013 if (trimmer != null) {
1014 this.trimmerMatcher = trimmer;
1015 }
1016 return this;
1017 }
1018
1019 //-----------------------------------------------------------------------
1020 /**
1021 * Gets whether the tokenizer currently returns empty tokens as null.
1022 * The default for this property is false.
1023 *
1024 * @return true if empty tokens are returned as null
1025 */
1026 public boolean isEmptyTokenAsNull() {
1027 return this.emptyAsNull;
1028 }
1029
1030 /**
1031 * Sets whether the tokenizer should return empty tokens as null.
1032 * The default for this property is false.
1033 *
1034 * @param emptyAsNull whether empty tokens are returned as null
1035 * @return this, to enable chaining
1036 */
1037 public StrTokenizer setEmptyTokenAsNull(boolean emptyAsNull) {
1038 this.emptyAsNull = emptyAsNull;
1039 return this;
1040 }
1041
1042 //-----------------------------------------------------------------------
1043 /**
1044 * Gets whether the tokenizer currently ignores empty tokens.
1045 * The default for this property is true.
1046 *
1047 * @return true if empty tokens are not returned
1048 */
1049 public boolean isIgnoreEmptyTokens() {
1050 return ignoreEmptyTokens;
1051 }
1052
1053 /**
1054 * Sets whether the tokenizer should ignore and not return empty tokens.
1055 * The default for this property is true.
1056 *
1057 * @param ignoreEmptyTokens whether empty tokens are not returned
1058 * @return this, to enable chaining
1059 */
1060 public StrTokenizer setIgnoreEmptyTokens(boolean ignoreEmptyTokens) {
1061 this.ignoreEmptyTokens = ignoreEmptyTokens;
1062 return this;
1063 }
1064
1065 //-----------------------------------------------------------------------
1066 /**
1067 * Gets the String content that the tokenizer is parsing.
1068 *
1069 * @return the string content being parsed
1070 */
1071 public String getContent() {
1072 if (chars == null) {
1073 return null;
1074 }
1075 return new String(chars);
1076 }
1077
1078 //-----------------------------------------------------------------------
1079 /**
1080 * Creates a new instance of this Tokenizer. The new instance is reset so
1081 * that it will be at the start of the token list.
1082 * If a {@link CloneNotSupportedException} is caught, return <code>null</code>.
1083 *
1084 * @return a new instance of this Tokenizer which has been reset.
1085 */
1086 public Object clone() {
1087 try {
1088 return cloneReset();
1089 } catch (CloneNotSupportedException ex) {
1090 return null;
1091 }
1092 }
1093
1094 /**
1095 * Creates a new instance of this Tokenizer. The new instance is reset so that
1096 * it will be at the start of the token list.
1097 *
1098 * @return a new instance of this Tokenizer which has been reset.
1099 * @throws CloneNotSupportedException if there is a problem cloning
1100 */
1101 Object cloneReset() throws CloneNotSupportedException {
1102 // this method exists to enable 100% test coverage
1103 StrTokenizer cloned = (StrTokenizer) super.clone();
1104 if (cloned.chars != null) {
1105 cloned.chars = (char[]) cloned.chars.clone();
1106 }
1107 cloned.reset();
1108 return cloned;
1109 }
1110
1111 //-----------------------------------------------------------------------
1112 /**
1113 * Gets the String content that the tokenizer is parsing.
1114 *
1115 * @return the string content being parsed
1116 */
1117 public String toString() {
1118 if (tokens == null) {
1119 return "StrTokenizer[not tokenized yet]";
1120 }
1121 return "StrTokenizer" + getTokenList();
1122 }
1123
1124 }