/*
* LingPipe v. 3.9
* Copyright (C) 2003-2010 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
//package com.aliasi.util;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.AbstractSet;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.Serializable;
/**
* A {@code CompactHashSet} implements the set interface more tightly in
* memory and more efficiently than Java's {@link java.util.HashSet}.
*
* Sizing and Resizing
*
* This hash set allows arbitrary capacities sized hash sets to be created,
* including hash sets of size 0. When resizing is necessary due to
* objects being added, it resizes to the next largest capacity that is
* at least 1.5 times
*
What's wrong with HashSet
?
*
* Java's hash set implementation {@link java.util.HashSet} wraps a
* {@link java.util.HashMap}, requiring a map entry with a dummy value
* for each set entry. Map entries are even heavier as the map
* entries also contain a next field to implement a linked list
* structure to deal with collisions. This class represents hash set
* entries in terms of the entries themselves. Iterators are based
* directly on the underlying hash table.
*
* Java's hash set implementation requires hash tables with
* capacities that are even powers of 2. This allows a very quick
* modulus operation on hash codes by using a mask, but restricts
* the size of hash sets to be powers of 2.
*
*
Implementation
*
* The implementation in this class uses open addressing, which uses a
* simple array to store all of the set elements based on their hash
* codes. Hash codes are taken modulo the capacity of the hash set,
* so capacities above a certain small level are rounded up so that
* they are not powers of 2.
*
* The current implementation uses linear probing with a step size
* of 1 for the case of hash code collisions. This means that if the
* position for an entry is full, the next position is considered
* (wrapping to the beginning at the end of the array).
*
*
We borrowed the supplemental hashing function from
* {@link java.util.HashMap} (version ID 1.73). If the
* initial hashFunction is h
the supplemental
* hash function is computed by:
*
*
* static int supplementalHash(int n) {
* int n2 = n ^ (n >>> 20) ^ (n >>> 12);
* return n2 ^ (n2 >>> 7) ^ (n2 >>> 4);
* }
*
* This is required to scramble the hash code of strings
* that share the same prefix with a different final character from
* being adjacent. Recall that the hash code for strings s
* consisting of characters s[0], ..., s[n-1]
is
* defined in {@link String#hashCode()} to be:
*
*
* s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1]
*
* Null Elements
*
* Attempts to add, remove, or test null objects for membership will
* throw null pointer exceptions.
*
* Illegal Types and Class Casts
*
* When adding, removing, or checking for membership, the
* {@link Object#equals(Object)} method may throw class cast exceptions
* if the specified object is not of a type comparable to the elements
* of the set.
*
* Resizing
*
* As more elements are added to the set, it will automatically resize
* its underlying array of buckets.
*
* As of this release, sets will never be resized downward.
*
*
Equality and Hash Codes
*
* Compact hash sets satisfy the specification of the
* the equality metho, {@link #equals(Object)}, and hash code
* method {@link #hashCode()} specified by the {@link java.util.Set} interface.
*
* The implementations are inherited from the superclass
* {@link AbstractSet}, which uses the underlying iterators.
*
*
Concurrent Modification
*
* This set implementation does not support concurrent
* modification. (Note that this does not exclusively have to do with
* threading, though multiple threads may lead to concurrent
* modifications.) If the set is modified during an iteration, or
* conversion to array, its behavior will be unpredictable, though it
* is likely to make errors in basic methods and to throw an array
* index or null pointer exception.
*
* Thread Safety
*
* This class is only thread safe with read-write locking.
* Any number of read operations may be executed concurrently,
* but writes must be executed exclusively. The write
* operations include any method that may potentially change
* the set.
*
* Serializability
*
* A small hash set is serializable if all of its elements are
* serializable. The deserialized object will be an instance of this
* class.
*
* References
*
*
* - Wikipedia: Open Addressing
* - Wikipedia: Linear Hash Probing
*
*
* @author Bob Carpenter
* @version 3.9.1
* @since LingPipe3.9.1
* @param the type of element stored in the set
*/
public class CompactHashSet
extends AbstractSet
implements Serializable {
static final long serialVersionUID = -2524057065260042957L;
private E[] mBuckets;
private int mSize = 0;
/**
* Construct a compact hash set with the specified initial
* capacity.
*
* @throws IllegalArgumentException If the capacity is less than 1.
* @throws OutOfMemoryException If the initial capacity is too
* large for the JVM.
*/
public CompactHashSet(int initialCapacity) {
if (initialCapacity < 1) {
String msg = "Capacity must be positive."
+ " Found initialCapacity=" + initialCapacity;
throw new IllegalArgumentException(msg);
}
alloc(initialCapacity);
}
/**
* Construct a compact hash set containing the specified list
* of values. It begins with a set of initial capacity 1.
*
* @param es Initial values to add to set.
*/
public CompactHashSet(E... es) {
this(1);
for (E e : es)
add(e);
}
/**
* Add the specified object to the set if it is not already
* present, returning {@code} true if the set didn't already
* contain the element. If the set already contains the
* object, there is no effect.
*
* @param e Object to add to set.
* @return {@code true} if the set didn't already contain the
* element.
* @throws NullPointerException If the specified element is {@code
* null}.
* @throws ClassCastException If the class of this object prevents
* it from being added to the set.
*/
public boolean add(E e) {
if (e == null) {
String msg = "Cannot add null to CompactHashSet";
throw new NullPointerException(msg);
}
int slot = findSlot(e);
if (mBuckets[slot] != null)
return false;
if ((mSize + 1) >= (LOAD_FACTOR * mBuckets.length)) {
realloc();
slot = findSlot(e);
if (mBuckets[slot] != null)
throw new IllegalStateException("");
}
mBuckets[slot] = e;
++mSize;
return true;
}
/**
* Remove all of the elements from this set.
*
* Note that this operation does not affect the
* underlying capacity of the set itself.
*/
@Override
public void clear() {
alloc(1);
}
/**
* Returns {@code true} if this set contains the specified object.
*
* @param o Object to test.
* @return {@code true} if this set contains the specified object.
* @throws NullPointerException If the specified object is null.
* @throws ClassCastException If the type of this object is incompatible
* with this set.
*/
@Override
public boolean contains(Object o) {
if (o == null) {
String msg = "Compact hash sets do not support null objects.";
throw new NullPointerException(msg);
}
Object o2 = mBuckets[findSlot(o)];
return o2 != null && o.equals(o2);
}
/**
* Returns an iterator over the elements in this set. The
* iterator supports the {@link Iterator#remove()} operation,
* which is not considered a concurrent modification.
*
*
Iteration order for sets is not guaranteed to be
* stable under adds or deletes, or for sets of different
* capacities containing the same elements.
*
*
The set must not be modified by other operations
* while an iteration is in process. Doing so may cause
* an illegal state and unpredictable behavior such as
* null pointer or array index exceptions.
*
* @return Iterator over the set elements.
*/
@Override
public Iterator iterator() {
return new BucketIterator();
}
/**
* Removes the specified object from the set, returning {@code true}
* if the object was present before the remove.
*
* @param o Object to remove.
* @return {@code true} if the object was present before the remove
* operation.
* @throws ClassCastException If the specified object is not compatible
* with this set.
*/
@Override
public boolean remove(Object o) {
if (o == null)
return false;
@SuppressWarnings("unchecked") // except if doesn't work
int slot = findSlot((E)o);
if (mBuckets[slot] == null)
return false;
mBuckets[slot] = null;
tampCollisions(slot);
--mSize;
return true;
}
// mBuckets[index] == null
void tampCollisions(int index) {
for (int i = nextIndex(index) ; mBuckets[i] != null; i = nextIndex(i)) {
int slot = findSlot(mBuckets[i]);
if (slot != i) {
mBuckets[slot] = mBuckets[i];
mBuckets[i] = null;
}
}
}
/**
* Removes all the elements of the specified collection from this
* set, returning {@code true} if the set was modified as a
* result.
*
* Implementation Note: Unlike the implementation the
* parent class {@link AbstractSet} inherits from its
* parent class {@link java.util.AbstractCollection}, this implementation
* iterates over the argument collection, removing each of its
* elements.
*
* @param collection Collection of objects to remove.
* @return {@code true} if the set was modified as a result.
* @throws NullPointerException If any of the collection members is null, or
* if the specified collection is itself null.
* @throws ClassCastException If attempting to remove a member of the
* collection results in a cast exception.
*/
@Override
public boolean removeAll(Collection> collection) {
boolean modified = false;
for (Object o : collection)
if (remove(o))
modified = true;
return modified;
}
/**
* Remove all elements from this set that are not members of the
* specified collection, returning {@code true} if this set was
* modified as a result of the operation.
*
*
Implementation Note: Unlike the implementation that
* the parent class {@link AbstractSet} inherits from {@link
* java.util.AbstractCollection}, this implementation directly
* visits the underlying hash entries rather than invoking the
* overhead of an iterator.
*
* @param collection Collection of objects to retain.
* @return {@code true} if this set was modified as a result of
* the operation.
* @throws ClassCastException If comparing elements of the
* specified collection to elements of this set throws a class
* cast exception.
* @throws NullPointerException If the specified collection is
* {@code null}.
*
*/
@Override
public boolean retainAll(Collection> collection) {
boolean modified = false;
for (int i = 0; i < mBuckets.length; ++i) {
if (mBuckets[i] != null && collection.contains(mBuckets[i])) {
modified = true;
mBuckets[i] = null;
tampCollisions(i);
--mSize;
}
}
return modified;
}
/**
* Returns the number of objects in this set.
*
* @return The number of objects in this set.
*/
@Override
public int size() {
return mSize;
}
/**
* Returns an object array containing all of the members of this
* set. The order of elements in the array is the iteration
* order, but this is not guaranteed to be stable under
* modifications or changes in capacity.
*
*
The returned array is fresh and may be modified without
* affect this set.
*
* @return Array of objects in this set.
*/
@Override
public Object[] toArray() {
Object[] result = new Object[mSize];
int nextIndex = 0;
for (int i = 0; i < mBuckets.length; ++i)
if (mBuckets[i] != null)
result[nextIndex++] = mBuckets[i];
return result;
}
/**
* Returns an array of elements contained in this set, the runtime type
* of which is that of the specified array argument.
*
*
If the specified array argument is long enough to hold all
* of the elements in this set, it will be filled starting from
* index 0. If the specified array is longer than the size of the
* set, the array entry following the last entry filled by this
* set will be set to {@code null}.
*
*
If the specified array is not long enough to hold all of
* the elements, a new array will be created of the appropriate
* type through reflection.
*
* @param array Array of values determining type of output and containing
* output elements if long enough.
* @param Type of target array.
* @throws ArrayStoreException If the members of this set cannot be
* inserted into an array of the specified type.
* @throws NullPointerException If the specified array is null.
*/
@Override
public T[] toArray(T[] array) {
// construction from java.util.AbstractCollection
@SuppressWarnings("unchecked")
T[] result
= array.length >= mSize
? array
: (T[]) Array.newInstance(array.getClass().getComponentType(),
mSize);
int nextIndex = 0;
for (int i = 0; i < mBuckets.length; ++i) {
if (mBuckets[i] != null) {
@SuppressWarnings("unchecked") // may bomb at run time according to spec
T next = (T) mBuckets[i];
result[nextIndex++] = next;
}
}
if (result.length > mSize)
result[mSize] = null; // req for interface
return result;
}
int findSlot(Object e) {
for (int i = firstIndex(e); ; i = nextIndex(i)) {
if (mBuckets[i] == null)
return i;
if (mBuckets[i].equals(e))
return i;
}
}
int firstIndex(Object e) {
return java.lang.Math.abs(supplementalHash(e.hashCode())) % mBuckets.length;
}
int nextIndex(int index) {
return (index + 1) % mBuckets.length;
}
void alloc(int capacity) {
if (capacity < 0) {
String msg = "Capacity must be non-negative."
+ " Found capacity=" + capacity;
throw new IllegalArgumentException(msg);
}
@SuppressWarnings("unchecked") // need for generic array
E[] buckets = (E[]) new Object[capacity];
mBuckets = buckets;
mSize = 0;
}
void realloc() {
E[] oldBuckets = mBuckets;
long capacity = java.lang.Math.max((long) (RESIZE_FACTOR * mBuckets.length),
mBuckets.length + 1);
if (capacity > Integer.MAX_VALUE) {
String msg = "Not enough room to resize."
+ " Last capacity=" + mBuckets.length
+ " Failed New capacity=" + capacity;
throw new IllegalArgumentException(msg);
}
alloc((int)capacity);
for (int i = 0; i < oldBuckets.length; ++i)
if (oldBuckets[i] != null)
add(oldBuckets[i]);
}
class BucketIterator implements Iterator {
int mNextBucket = 0;
int mRemoveIndex = -1;
public boolean hasNext() {
for ( ; mNextBucket < mBuckets.length; ++mNextBucket)
if (mBuckets[mNextBucket] != null)
return true;
return false;
}
public E next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
mRemoveIndex = mNextBucket++;
return mBuckets[mRemoveIndex];
}
public void remove() {
if (mRemoveIndex == -1)
throw new IllegalStateException();
mBuckets[mRemoveIndex] = null;
--mSize;
mRemoveIndex = -1;
}
}
static final float LOAD_FACTOR = 0.75f;
static final float RESIZE_FACTOR = 1.5f;
// function recoded from the Sun JDK's java.util.HashMap version
// 1.73
static int supplementalHash(int n) {
int n2 = n ^ (n >>> 20) ^ (n >>> 12);
return n2 ^ (n2 >>> 7) ^ (n2 >>> 4);
}
}