Count-the-number-of-chars-included-in-the-given-byte


            import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/*
 *  Licensed to the Apache Software Foundation (ASF) under one
 *  or more contributor license agreements.  See the NOTICE file
 *  distributed with this work for additional information
 *  regarding copyright ownership.  The ASF licenses this file
 *  to you under the Apache License, Version 2.0 (the
 *  "License"); you may not use this file except in compliance
 *  with the License.  You may obtain a copy of the License at
 *  
 *    http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing,
 *  software distributed under the License is distributed on an
 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  KIND, either express or implied.  See the License for the
 *  specific language governing permissions and limitations
 *  under the License. 
 *  
 */
/**
 * Various string manipulation methods that are more efficient then chaining
 * string operations: all is done in the same buffer without creating a bunch of
 * string objects.
 * 
 * @author Dungeon Project
 */
public class Main {
  private static final int UTF8_MULTI_BYTES_MASK = 0x0080;
  private static final int UTF8_TWO_BYTES_MASK = 0x00E0;
  private static final int UTF8_TWO_BYTES = 0x00C0;
  private static final int UTF8_THREE_BYTES_MASK = 0x00F0;
  private static final int UTF8_THREE_BYTES = 0x00E0;
  private static final int UTF8_FOUR_BYTES_MASK = 0x00F8;
  private static final int UTF8_FOUR_BYTES = 0x00F0;
  private static final int UTF8_FIVE_BYTES_MASK = 0x00FC;
  private static final int UTF8_FIVE_BYTES = 0x00F8;
  private static final int UTF8_SIX_BYTES_MASK = 0x00FE;
  private static final int UTF8_SIX_BYTES = 0x00FC;
  /**
   * Count the number of bytes needed to return an Unicode char. This can be
   * from 1 to 6.
   * 
   * @param bytes
   *            The bytes to read
   * @param pos
   *            Position to start counting. It must be a valid start of a
   *            encoded char !
   * @return The number of bytes to create a char, or -1 if the encoding is
   *         wrong. TODO : Should stop after the third byte, as a char is only
   *         2 bytes long.
   */
  public static final int countBytesPerChar( byte[] bytes, int pos )
  {
      if ( bytes == null )
      {
          return -1;
      }
      if ( ( bytes[pos] & UTF8_MULTI_BYTES_MASK ) == 0 )
      {
          return 1;
      }
      else if ( ( bytes[pos] & UTF8_TWO_BYTES_MASK ) == UTF8_TWO_BYTES )
      {
          return 2;
      }
      else if ( ( bytes[pos] & UTF8_THREE_BYTES_MASK ) == UTF8_THREE_BYTES )
      {
          return 3;
      }
      else if ( ( bytes[pos] & UTF8_FOUR_BYTES_MASK ) == UTF8_FOUR_BYTES )
      {
          return 4;
      }
      else if ( ( bytes[pos] & UTF8_FIVE_BYTES_MASK ) == UTF8_FIVE_BYTES )
      {
          return 5;
      }
      else if ( ( bytes[pos] & UTF8_SIX_BYTES_MASK ) == UTF8_SIX_BYTES )
      {
          return 6;
      }
      else
      {
          return -1;
      }
  }
  
  /**
   * Count the number of chars included in the given byte[].
   * 
   * @param bytes
   *            The byte array to decode
   * @return The number of char in the byte array
   */
  public static final int countChars( byte[] bytes )
  {
      if ( bytes == null )
      {
          return 0;
      }
      int nbChars = 0;
      int currentPos = 0;
      while ( currentPos < bytes.length )
      {
          currentPos += countBytesPerChar( bytes, currentPos );
          nbChars++;
      }
      return nbChars;
  }
  
}
Development Java Tutorial