// The CSV tokenizer class allows an application to break a Comma Separated Value format into tokens.


            /**
 * 
 * JFreeReport : a free Java reporting library
 * 
 *
 * Project Info:  http://reporting.pentaho.org/
 *
 * (C) Copyright 2001-2007, by Object Refinery Ltd, Pentaho Corporation and Contributors.
 *
 * This library is free software; you can redistribute it and/or modify it under the terms
 * of the GNU Lesser General Public License as published by the Free Software Foundation;
 * either version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License along with this
 * library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * [Java is a trademark or registered trademark of Sun Microsystems, Inc.
 * in the United States and other countries.]
 *
 * ------------
 * CSVTokenizer.java
 * ------------
 * (C) Copyright 2001-2007, by Object Refinery Ltd, Pentaho Corporation and Contributors.
 */
import java.util.Enumeration;
import java.util.NoSuchElementException;
/**
 * The CSV tokenizer class allows an application to break a record in Comma Separated Value format into tokens. The
 * tokenization method is much simpler than the one used by the <code>StringTokenizer</code> class. The
 * <code>CSVTokenizer</code> methods do not distinguish among identifiers and numbers, nor do they recognize and skip
 * comments.
 * <p>
 * The separator (the string that separates tokens) may be specified either at creation time or on a per-token
 * basis.
 * <p>
 * Unlike <code>StringTokenizer</code>, this class has no <code>returnSeparators</code> flag: separators only
 * delimit tokens and are never returned as tokens themselves. A token is either a maximal sequence of consecutive
 * characters that does not contain the separator, or a quoted section. A quoted section starts with the quote
 * character and ends at the matching closing quote; inside it the separator is ordinary text and a doubled quote
 * character stands for one literal quote character. The surrounding quotes are not part of the returned token.
 * <p>
 * A <code>CSVTokenizer</code> object internally maintains a current position within the string to be tokenized.
 * Some operations advance this current position past the characters processed.
 * <p>
 * An unquoted token is returned by taking a substring of the (trimmed) string that was used to create the
 * <code>CSVTokenizer</code> object.
 * <p>
 * The following is one example of the use of the tokenizer. The code:
 * <pre>
 *     CSVTokenizer csvt = new CSVTokenizer("this,is,a,test");
 *     while (csvt.hasMoreTokens()) {
 *         println(csvt.nextToken());
 *     }
 * </pre>
 * prints the following output:
 * <pre>
 *     this
 *     is
 *     a
 *     test
 * </pre>
 *
 * @author abupon
 */
public class CSVTokenizer implements Enumeration
{
  /**
   * A possible separator constant: the comma.
   */
  public static final String SEPARATOR_COMMA = ",";
  /**
   * A possible separator constant: the tabulator.
   */
  public static final String SEPARATOR_TAB = "\t";
  /**
   * A possible separator constant: the space.
   */
  public static final String SEPARATOR_SPACE = " ";
  /**
   * A possible quote string constant: the double quote.
   */
  public static final String DOUBLE_QUATE = "\"";
  /**
   * A possible quote string constant: the single quote.
   */
  public static final String SINGLE_QUATE = "'";

  /**
   * The complete (trimmed) record that should be separated into elements.
   */
  private final String record;
  /**
   * The separator string; never null and never empty.
   */
  private String separator;
  /**
   * The quoting string; never null. An empty string disables quote handling.
   */
  private String quate;
  /**
   * The current parsing position. After a token has been read it points at the delimiter that follows the token
   * (or at the end of the record), not behind that delimiter.
   */
  private int currentIndex;
  /**
   * A flag indicating that no token has been read yet, so that there is no delimiter to step over.
   */
  private boolean beforeStart;

  /**
   * Constructs a csv tokenizer for the specified string. The separator string itself is never returned as a token;
   * it only delimits tokens. Leading and trailing whitespace of the given string is removed with
   * <code>String.trim()</code> before parsing.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.SEPARATOR_TAB,
   *                     CSVTokenizer.SEPARATOR_SPACE, etc.).
   * @param theQuate     the quate (CSVTokenizer.SINGLE_QUATE, CSVTokenizer.DOUBLE_QUATE, etc.). An empty string
   *                     disables quote handling.
   * @throws NullPointerException     if any of the arguments is null.
   * @throws IllegalArgumentException if the separator is empty.
   */
  public CSVTokenizer(final String aString, final String theSeparator, final String theQuate)
  {
    if (aString == null)
    {
      throw new NullPointerException("The given string is null");
    }
    checkSeparator(theSeparator);
    if (theQuate == null)
    {
      throw new NullPointerException("The given quate is null");
    }
    this.record = aString.trim();
    this.separator = theSeparator;
    this.quate = theQuate;
    this.currentIndex = 0;
    this.beforeStart = true;
  }

  /**
   * Constructs a csv tokenizer for the specified string that uses CSVTokenizer.DOUBLE_QUATE as quote string. The
   * separator string itself will not be treated as a token.
   *
   * @param aString      a string to be parsed.
   * @param theSeparator the separator (CSVTokenizer.SEPARATOR_COMMA, CSVTokenizer.SEPARATOR_TAB,
   *                     CSVTokenizer.SEPARATOR_SPACE, etc.).
   * @throws NullPointerException     if any of the arguments is null.
   * @throws IllegalArgumentException if the separator is empty.
   */
  public CSVTokenizer(final String aString, final String theSeparator)
  {
    this(aString, theSeparator, CSVTokenizer.DOUBLE_QUATE);
  }

  /**
   * Constructs a csv tokenizer for the specified string. The tokenizer uses the default separator, which is
   * CSVTokenizer.SEPARATOR_COMMA, and the default quote string, which is CSVTokenizer.DOUBLE_QUATE.
   *
   * @param aString a string to be parsed.
   * @throws NullPointerException if the given string is null.
   */
  public CSVTokenizer(final String aString)
  {
    this(aString, CSVTokenizer.SEPARATOR_COMMA);
  }

  /**
   * Tests if there are more tokens available from this tokenizer's string.
   *
   * @return true if the current position has not yet reached the end of the record; false otherwise.
   */
  public boolean hasMoreTokens()
  {
    return (this.currentIndex < this.record.length());
  }

  /**
   * Returns the next token from this tokenizer. A token that starts with the quote string extends to the matching
   * closing quote string; inside such a token the separator is plain text and a doubled quote string stands for
   * one literal quote string.
   * <p>
   * When the record turns out to be malformed, the tokenizer is moved to the end of the record before the exception
   * is thrown, so that a subsequent call to <code>hasMoreTokens()</code> returns false.
   *
   * @return the next token from this tokenizer.
   * @throws NoSuchElementException   if there are no more tokens in this tokenizer's string.
   * @throws IllegalArgumentException if a quoted token is not terminated or if stepping over the delimiter in
   *                                  front of the token would leave the record.
   */
  public String nextToken()
      throws NoSuchElementException, IllegalArgumentException
  {
    if (!hasMoreTokens())
    {
      throw new NoSuchElementException();
    }

    int start = this.currentIndex;
    if (this.beforeStart)
    {
      this.beforeStart = false;
    }
    else
    {
      // The previous token stopped in front of its delimiter; step over it now. The current separator's length is
      // used, which can overrun the record (e.g. stray text behind a closing quote with a multi-character
      // separator), so that case is reported as a format error instead of an index exception.
      start += this.separator.length();
      if (start > this.record.length())
      {
        throw illegalFormat();
      }
    }

    // An empty quote string would match everywhere and never terminate, so it means "no quoting".
    if (this.quate.length() > 0 && this.record.startsWith(this.quate, start))
    {
      return readQuotedToken(start);
    }

    final int end = this.record.indexOf(this.separator, start);
    if (end < 0)
    {
      // Last token: everything up to the end of the record.
      this.currentIndex = this.record.length();
      return this.record.substring(start);
    }
    this.currentIndex = end;
    return this.record.substring(start, end);
  }

  /**
   * Reads a quoted token and moves the current position directly behind its closing quote string.
   *
   * @param start the index of the opening quote string.
   * @return the token without the surrounding quotes and with doubled quotes collapsed.
   * @throws IllegalArgumentException if the closing quote string is missing.
   */
  private String readQuotedToken(final int start)
  {
    final int quoteLength = this.quate.length();
    final StringBuffer token = new StringBuffer();
    int scan = start + quoteLength;
    while (true)
    {
      final int close = this.record.indexOf(this.quate, scan);
      if (close < 0)
      {
        throw illegalFormat();
      }
      token.append(this.record.substring(scan, close));
      scan = close + quoteLength;
      if (!this.record.startsWith(this.quate, scan))
      {
        // A single quote string: this one closes the token.
        break;
      }
      // A doubled quote string is an escaped literal quote; keep one copy and continue behind the pair.
      token.append(this.quate);
      scan += quoteLength;
    }
    this.currentIndex = scan;
    return token.toString();
  }

  /**
   * Marks the tokenizer as exhausted and creates the exception that reports a malformed record.
   *
   * @return the exception to be thrown by the caller.
   */
  private IllegalArgumentException illegalFormat()
  {
    this.currentIndex = this.record.length();
    return new IllegalArgumentException("Illegal format");
  }

  /**
   * Validates a separator string.
   *
   * @param theSeparator the separator to check.
   * @throws NullPointerException     if the separator is null.
   * @throws IllegalArgumentException if the separator is empty; an empty separator matches at every position and
   *                                  would keep the tokenizer from ever advancing.
   */
  private static void checkSeparator(final String theSeparator)
  {
    if (theSeparator == null)
    {
      throw new NullPointerException("The given separator is null");
    }
    if (theSeparator.length() == 0)
    {
      throw new IllegalArgumentException("The given separator is empty");
    }
  }

  /**
   * Switches to the given separator and then returns the next token. The new separator remains in effect after this
   * call. Note that the delimiter in front of the requested token is stepped over using the length of the new
   * separator.
   *
   * @param theSeparator the new separator.
   * @return the next token, after switching to the new separator.
   * @throws NullPointerException     if the separator is null.
   * @throws IllegalArgumentException if the separator is empty or the record is malformed.
   * @throws java.util.NoSuchElementException
   *                                  if there are no more tokens in this tokenizer's string.
   */
  public String nextToken(final String theSeparator)
  {
    checkSeparator(theSeparator);
    this.separator = theSeparator;
    return nextToken();
  }

  /**
   * Returns the same value as the <code>hasMoreTokens</code> method. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return true if there are more tokens; false otherwise.
   * @see java.util.Enumeration
   * @see #hasMoreTokens()
   */
  public boolean hasMoreElements()
  {
    return hasMoreTokens();
  }

  /**
   * Returns the same value as the <code>nextToken</code> method, except that its declared return value is
   * <code>Object</code> rather than <code>String</code>. It exists so that this class can implement the
   * <code>Enumeration</code> interface.
   *
   * @return the next token in the string.
   * @throws java.util.NoSuchElementException
   *          if there are no more tokens in this tokenizer's string.
   * @see java.util.Enumeration
   * @see #nextToken()
   */
  public Object nextElement()
  {
    return nextToken();
  }

  /**
   * Calculates the number of times that this tokenizer's <code>nextToken</code> method can be called before it
   * generates an exception. The current position is not advanced; it is restored even when the remaining record is
   * malformed and the count is aborted with an exception.
   *
   * @return the number of tokens remaining in the string using the current separator.
   * @throws IllegalArgumentException if the remaining record is malformed.
   * @see #nextToken()
   */
  public int countTokens()
  {
    final int savedIndex = this.currentIndex;
    final boolean savedBeforeStart = this.beforeStart;
    try
    {
      int count = 0;
      while (hasMoreTokens())
      {
        nextToken();
        count++;
      }
      return count;
    }
    finally
    {
      this.currentIndex = savedIndex;
      this.beforeStart = savedBeforeStart;
    }
  }

  /**
   * Returns the quate.
   *
   * @return the quote string; an empty string means that quote handling is disabled.
   */
  public String getQuate()
  {
    return this.quate;
  }

  /**
   * Sets the quate.
   *
   * @param quate The quate to set; an empty string disables quote handling.
   * @throws NullPointerException if the given quate is null.
   */
  public void setQuate(final String quate)
  {
    if (quate == null)
    {
      throw new NullPointerException("The given quate is null");
    }
    this.quate = quate;
  }
}

// Development Class Java