com.ibm.icu.text.BreakIterator java code examples

  /**
   * Cuts the given string at every boundary reported by a word-instance
   * {@code BreakIterator} and collects the pieces in order.
   *
   * <p>Every span between two consecutive boundaries is added, whatever it
   * contains.
   *
   * @param preparedString the text to split
   * @return the consecutive segments of {@code preparedString}, in text order
   */
  private List<String> getAllWords(final String preparedString) {
    final List<String> segments = new ArrayList<>();
    final BreakIterator boundaries = BreakIterator.getWordInstance();
    boundaries.setText(preparedString);

    int segmentStart = boundaries.first();
    int segmentEnd = boundaries.next();
    while (segmentEnd != BreakIterator.DONE) {
      segments.add(preparedString.substring(segmentStart, segmentEnd));
      // The end of this segment is the start of the next one.
      segmentStart = segmentEnd;
      segmentEnd = boundaries.next();
    }
    return segments;
  }
}

/**
 * Forwards the request to {@code fIcuBrkItr} and hands back its answer.
 *
 * @param offset the offset passed through to the wrapped iterator
 * @return whatever {@code fIcuBrkItr.following(offset)} returns
 */
@Override
public int following(int offset) {
  final int boundary = fIcuBrkItr.following(offset);
  return boundary;
}

public ICULineBreaker(Locale locale) {
  this.breaker = BreakIterator.getLineInstance(locale);
}

/**
 * Creates a new break iterator.
 */
public JavaBreakIterator() {
  fIterator = BreakIterator.getWordInstance();
  fIndex = fIterator.current();
}

/**
 * Advances {@code bi} by one boundary and refreshes {@code status} from the
 * pair (position before the move, position after the move).
 *
 * @return the value returned by {@code bi.next()}
 */
@Override
int next() {
 // The starting position must be read before the iterator moves.
 final int from = bi.current();
 final int to = bi.next();
 status = calcStatus(from, to);
 return to;
}

/**
 * Locates the region between the break-iterator boundaries that surround
 * {@code offset}.
 *
 * <p>Boundaries the iterator cannot supply ({@code DONE}) fall back to the
 * start or end of the line containing {@code offset}. When {@code offset} is
 * itself a boundary, the longer of the two neighbouring spans is kept.
 *
 * @param document the document to search
 * @param offset the offset to inspect
 * @return the region found, or {@code null} when {@code offset} is at the end
 *         of its line, the region would be empty, or the document rejects the
 *         location
 */
@Override
protected IRegion findWord(IDocument document, int offset) {
  try {
    final IRegion lineInfo = document.getLineInformationOfOffset(offset);
    final int lineStart = lineInfo.getOffset();
    final int lineEnd = lineStart + lineInfo.getLength();
    if (offset == lineEnd) {
      return null;
    }

    com.ibm.icu.text.BreakIterator wordBreaks = createBreakIterator();
    wordBreaks.setText(new DocumentCharacterIterator(document));

    int wordStart = wordBreaks.preceding(offset);
    if (wordStart == BreakIterator.DONE) {
      wordStart = lineStart;
    }
    int wordEnd = wordBreaks.following(offset);
    if (wordEnd == BreakIterator.DONE) {
      wordEnd = lineEnd;
    }

    if (wordBreaks.isBoundary(offset)) {
      // offset sits between two spans: keep the longer side, ties go left.
      final boolean rightSideLonger = wordEnd - offset > offset - wordStart;
      if (rightSideLonger) {
        wordStart = offset;
      } else {
        wordEnd = offset;
      }
    }

    return wordEnd == wordStart ? null : new Region(wordStart, wordEnd - wordStart);
  } catch (BadLocationException e) {
    return null;
  }
}

fWordIterator.setText(content);
fPredecessor= fWordIterator.first();
fSuccessor= fWordIterator.next();
final BreakIterator iterator= BreakIterator.getSentenceInstance(locale);
iterator.setText(content);
int offset= iterator.current();
while (offset != BreakIterator.DONE) {
  offset= iterator.next();

  /**
   * Splits {@code id} at word boundaries and reports whether any segment
   * that starts with a letter or digit satisfies {@code matches}.
   *
   * @param id the identifier to split into segments
   * @return {@code true} as soon as one letter-or-digit segment matches,
   *         {@code false} if none does
   */
  private boolean matchesSegments(String id) {
    BreakIterator iter = BreakIterator.getWordInstance();
    iter.setText(id);
    int i = iter.first();
    while (i != BreakIterator.DONE && i < id.length()) {
      int j = iter.following(i);
      if (j == BreakIterator.DONE) {
        // No further boundary: the last segment runs to the end of the text.
        j = id.length();
      }
      // Inspect the whole code point: a char-based check sees only a lone
      // surrogate for supplementary-plane letters and would skip the segment.
      if (Character.isLetterOrDigit(id.codePointAt(i))) {
        String word = id.substring(i, j);
        if (matches(word)) {
          return true;
        }
      }
      i = j;
    }
    return false;
  }
}

/**
 * Fills {@code textFragments} once: the text is cut at every line-break
 * opportunity and each piece is measured with the given {@code GC}.
 *
 * <p>Does nothing if {@code textFragments} has already been computed. Each
 * fragment stores the end offset of its piece and the measured width, both
 * narrowed to {@code short}.
 *
 * @param gc the graphics context used to measure each piece
 */
private void computeTextFragments(GC gc) {
  if (textFragments != null) {
    return;
  }

  ArrayList<TextFragment> fragments = new ArrayList<>();
  BreakIterator lineBreaks = BreakIterator.getLineInstance();
  lineBreaks.setText(getText());

  int fragmentStart = 0;
  int boundary = lineBreaks.first();
  while (boundary != BreakIterator.DONE) {
    // A boundary at offset 0 would produce an empty piece, so it is skipped.
    if (boundary != 0) {
      String piece = text.substring(fragmentStart, boundary);
      Point size = gc.textExtent(piece);
      fragments.add(new TextFragment((short) boundary, (short) size.x));
      fragmentStart = boundary;
    }
    boundary = lineBreaks.next();
  }

  textFragments = fragments.toArray(new TextFragment[fragments.size()]);
}

/**
 * Returns the first sentence of the Javadoc text of the given element.
 *
 * <p>Only {@code IMember} elements are considered. Their Javadoc is read
 * through {@code JavadocContentAccess.getHTMLContentReader}, wrapped in an
 * {@code HTML2TextReader}, and cut at the first boundary reported by a
 * sentence-instance {@code BreakIterator}.
 *
 * @param elem the element whose Javadoc should be read
 * @return the text up to the first sentence boundary, or the empty string
 *         when the element is not a member, has no Javadoc reader, or its
 *         text is empty
 * @throws JavaModelException propagated from the Javadoc content access
 */
private String retrieveText(IJavaElement elem) throws JavaModelException {
  if (elem instanceof IMember) {
    Reader reader= JavadocContentAccess.getHTMLContentReader((IMember)elem, true, true);
    if (reader != null)
      reader= new HTML2TextReader(reader, null);
    if (reader != null) {
      String str= getString(reader);
      // NOTE(review): getString is not visible here; guard in case it can
      // yield null (e.g. on a read failure) - confirm against its definition.
      if (str == null)
        return ""; //$NON-NLS-1$
      BreakIterator breakIterator= BreakIterator.getSentenceInstance();
      breakIterator.setText(str);
      int firstSentenceEnd= breakIterator.next();
      // For empty text there is no boundary after the start and next()
      // returns DONE (-1), which must not be used as a substring index.
      if (firstSentenceEnd == BreakIterator.DONE)
        return str;
      return str.substring(0, firstSentenceEnd);
    }
  }
  return ""; //$NON-NLS-1$
}

  /**
   * Hands the new text to {@code breaker}.
   *
   * @param newText the text passed on to {@code breaker.setText}
   */
  @Override
  public void setText(String newText) {
    breaker.setText(newText);
  }
}

/**
 * Advances {@code breaker} and reports where it stopped.
 *
 * @return the value returned by {@code breaker.next()}
 */
@Override
public int next() {
  final int boundary = breaker.next();
  return boundary;
}

/**
 * Provides the cached word break iterator, creating it through
 * {@code BreakIterator.getWordInstance()} the first time it is requested.
 *
 * @return the word break iterator held in {@code fWordBreakIterator}
 * @since 3.7
 */
private BreakIterator getWordBreakIterator() {
  if (fWordBreakIterator != null) {
    return fWordBreakIterator;
  }
  fWordBreakIterator= BreakIterator.getWordInstance();
  return fWordBreakIterator;
}

  result = breakIterator.following(position);
} else {
  assert direction == Direction.Backwards;
  result = breakIterator.preceding(position);
    int ruleStatus = breakIterator.getRuleStatus();
    if (BreakIterator.WORD_NONE <= ruleStatus && ruleStatus < BreakIterator.WORD_NONE_LIMIT) {
      breakType = "none";
    int ruleStatus = breakIterator.getRuleStatus();
    if (LineBreakTag.SOFT <= ruleStatus && ruleStatus < LineBreakTag.SOFT_LIMIT) {
      breakType = "soft";
    int ruleStatus = breakIterator.getRuleStatus();
    if (SentenceBreakTag.TERM <= ruleStatus && ruleStatus < SentenceBreakTag.TERM_LIMIT) {
      breakType = "term";
iterator.setPosition(breakIterator.current());

/**
 * Moves {@code fIterator} to its first boundary and remembers that position
 * in {@code fIndex}.
 *
 * @return the position stored in {@code fIndex}
 */
@Override
public int first() {
  final int firstBoundary= fIterator.first();
  fIndex= firstBoundary;
  return firstBoundary;
}

/**
 * Creates a BreakIterator for sentence boundaries using the default locale,
 * i.e. the text being analyzed is assumed to be in that locale's language.
 * @return A new sentence-boundary BreakIterator for {@code Locale.getDefault()}.
 * @stable ICU 2.0
 */
public static BreakIterator getSentenceInstance()
{
  final Locale defaultLocale = Locale.getDefault();
  return getSentenceInstance(defaultLocale);
}

/**
 * Forwards the request to {@code fIcuBrkItr} and hands back its answer.
 *
 * @param offset the offset passed through to the wrapped iterator
 * @return whatever {@code fIcuBrkItr.preceding(offset)} returns
 */
@Override
public int preceding(int offset) {
  final int boundary = fIcuBrkItr.preceding(offset);
  return boundary;
}

/**
 * Reports the position {@code fIcuBrkItr} is currently at.
 *
 * @return whatever {@code fIcuBrkItr.current()} returns
 */
@Override
public int current() {
  final int position = fIcuBrkItr.current();
  return position;
}

BreakIterator breakIter= BreakIterator.getWordInstance();
breakIter.setText(fDocIter);
int start= breakIter.preceding(position);
if (start == BreakIterator.DONE)
  start= line.getOffset();
int end= breakIter.following(position);
if (end == BreakIterator.DONE)
  end= line.getOffset() + line.getLength();
if (breakIter.isBoundary(position)) {
  if (end - position > position- start)
    start= position;

fWordIterator.setText(content);
fPredecessor= fWordIterator.first();
fSuccessor= fWordIterator.next();
final BreakIterator iterator= BreakIterator.getSentenceInstance(locale);
iterator.setText(content);
int offset= iterator.current();
while (offset != BreakIterator.DONE) {
  offset= iterator.next();

Javadoc

java.text.BreakIterator. _usage_

A class that locates boundaries in text. This class defines a protocol for objects that break up a piece of natural-language text according to a set of criteria. Instances or subclasses of BreakIterator can be provided, for example, to break a piece of text into words, sentences, or logical characters according to the conventions of some language or group of languages. We provide five built-in types of BreakIterator:

getTitleInstance() returns a BreakIterator that locates boundaries between title breaks.
getSentenceInstance() returns a BreakIterator that locates boundaries between sentences. This is useful for triple-click selection, for example.
getWordInstance() returns a BreakIterator that locates boundaries between words. This is useful for double-click selection or "find whole words" searches. This type of BreakIterator makes sure there is a boundary position at the beginning and end of each legal word. (Numbers count as words, too.) Whitespace and punctuation are kept separate from real words.
getLineInstance() returns a BreakIterator that locates positions where it is legal for a text editor to wrap lines. This is similar to word breaking, but not the same: punctuation and whitespace are generally kept with words (you don't want a line to start with whitespace, for example), and some special characters can force a position to be considered a line-break position or prevent a position from being a line-break position.
getCharacterInstance() returns a BreakIterator that locates boundaries between logical characters. Because of the structure of the Unicode encoding, a logical character may be stored internally as more than one Unicode code point. (A with an umlaut may be stored as an a followed by a separate combining umlaut character, for example, but the user still thinks of it as one character.) This iterator allows various processes (especially text editors) to treat as characters the units of text that a user would think of as characters, rather than the units of text that the computer sees as "characters".

The text boundary positions are found according to the rules described in Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex #14, Line Breaking Properties. These are available at http://www.unicode.org/reports/tr29/ and http://www.unicode.org/reports/tr14/ respectively.

BreakIterator's interface follows an "iterator" model (hence the name), meaning it has a concept of a "current position" and methods like first(), last(), next(), and previous() that update the current position. All BreakIterators uphold the following invariants:

The beginning and end of the text are always treated as boundary positions.
The current position of the iterator is always a boundary position (random-access methods move the iterator to the nearest boundary position before or after the specified position, not _to_ the specified position).
DONE is used as a flag to indicate when iteration has stopped. DONE is only returned when the current position is the end of the text and the user calls next(), or when the current position is the beginning of the text and the user calls previous().
Break positions are numbered by the positions of the characters that follow them. Thus, under normal circumstances, the position before the first character is 0, the position after the first character is 1, and the position after the last character is 1 plus the length of the string.
The client can change the position of an iterator, or the text it analyzes, at will, but cannot change the behavior. If the user wants different behavior, he must instantiate a new iterator.

BreakIterator accesses the text it analyzes through a CharacterIterator, which makes it possible to use BreakIterator to analyze text in any text-storage vehicle that provides a CharacterIterator interface. Note: Some types of BreakIterator can take a long time to create, and instances of BreakIterator are not currently cached by the system. For optimal performance, keep instances of BreakIterator around as long as makes sense. For example, when word-wrapping a document, don't create and destroy a new BreakIterator for each line. Create one break iterator for the whole document (or whatever stretch of text you're wrapping) and use it to do the whole job of wrapping the text.

Examples:

Creating and using text boundaries

 
/**
 * Demo entry point: when exactly one argument is given, prints its words in
 * order, then its sentences (US locale) in reverse order, followed by the
 * first and the last sentence. Does nothing for any other argument count.
 *
 * @param args command-line arguments; {@code args[0]} is the text to examine
 */
public static void main(String[] args) {
    if (args.length != 1) {
        return;
    }
    final String stringToExamine = args[0];

    // Words, front to back.
    BreakIterator wordBoundary = BreakIterator.getWordInstance();
    wordBoundary.setText(stringToExamine);
    printEachForward(wordBoundary, stringToExamine);

    // Sentences, back to front, then the first and last one individually.
    BreakIterator sentenceBoundary = BreakIterator.getSentenceInstance(Locale.US);
    sentenceBoundary.setText(stringToExamine);
    printEachBackward(sentenceBoundary, stringToExamine);
    printFirst(sentenceBoundary, stringToExamine);
    printLast(sentenceBoundary, stringToExamine);
}

Print each element in order

 
/**
 * Prints every element delimited by {@code boundary}, from the start of the
 * text to its end, one per line.
 *
 * @param boundary an iterator whose text has already been set to {@code source}
 * @param source the text the boundary positions refer to
 */
public static void printEachForward(BreakIterator boundary, String source) {
    int elementStart = boundary.first();
    int elementEnd = boundary.next();
    while (elementEnd != BreakIterator.DONE) {
        System.out.println(source.substring(elementStart, elementEnd));
        // The end of this element is the start of the next one.
        elementStart = elementEnd;
        elementEnd = boundary.next();
    }
}

Print each element in reverse order

 
/**
 * Prints every element delimited by {@code boundary}, from the end of the
 * text back to its start, one per line.
 *
 * @param boundary an iterator whose text has already been set to {@code source}
 * @param source the text the boundary positions refer to
 */
public static void printEachBackward(BreakIterator boundary, String source) {
    int elementEnd = boundary.last();
    int elementStart = boundary.previous();
    while (elementStart != BreakIterator.DONE) {
        System.out.println(source.substring(elementStart, elementEnd));
        // The start of this element is the end of the one before it.
        elementEnd = elementStart;
        elementStart = boundary.previous();
    }
}

Print first element

 
/**
 * Prints the first element delimited by {@code boundary}.
 *
 * <p>Prints nothing when the text is empty: in that case there is no
 * boundary after the start and {@code next()} returns
 * {@code BreakIterator.DONE}, which must not be used as a substring index.
 *
 * @param boundary an iterator whose text has already been set to {@code source}
 * @param source the text the boundary positions refer to
 */
public static void printFirst(BreakIterator boundary, String source) {
    int start = boundary.first();
    int end = boundary.next();
    if (end == BreakIterator.DONE) {
        return;
    }
    System.out.println(source.substring(start, end));
}

Print last element

 
/**
 * Prints the last element delimited by {@code boundary}.
 *
 * <p>Prints nothing when the text is empty: in that case there is no
 * boundary before the end and {@code previous()} returns
 * {@code BreakIterator.DONE}, which must not be used as a substring index.
 *
 * @param boundary an iterator whose text has already been set to {@code source}
 * @param source the text the boundary positions refer to
 */
public static void printLast(BreakIterator boundary, String source) {
    int end = boundary.last();
    int start = boundary.previous();
    if (start == BreakIterator.DONE) {
        return;
    }
    System.out.println(source.substring(start, end));
}

Print the element at a specified position

 
/**
 * Prints the element that contains the character at {@code pos}.
 *
 * <p>Prints nothing when {@code pos} is the end of the text: no boundary
 * follows it, so {@code following(pos)} returns {@code BreakIterator.DONE},
 * which must not be used as a substring index.
 *
 * @param boundary an iterator whose text has already been set to {@code source}
 * @param pos the character offset to look up
 * @param source the text the boundary positions refer to
 */
public static void printAt(BreakIterator boundary, int pos, String source) {
    int end = boundary.following(pos);
    if (end == BreakIterator.DONE) {
        return;
    }
    int start = boundary.previous();
    System.out.println(source.substring(start, end));
}

Find the next word

 
/**
 * Finds the start of the next word after {@code pos}.
 *
 * <p>The spans between consecutive word boundaries following {@code pos} are
 * scanned in order; the first span containing at least one letter is taken
 * to be a word and its start offset is returned. Letters are tested per code
 * point, so supplementary-plane letters (stored as surrogate pairs) count.
 *
 * @param pos the character offset to start searching after
 * @param text the text to search
 * @return the start offset of the next word, or {@code BreakIterator.DONE}
 *         if no span after {@code pos} contains a letter
 */
public static int nextWordStartAfter(int pos, String text) {
    BreakIterator wb = BreakIterator.getWordInstance();
    wb.setText(text);
    int last = wb.following(pos);
    if (last == BreakIterator.DONE) {
        // No boundary after pos, hence no span left to inspect.
        return BreakIterator.DONE;
    }
    int current = wb.next();
    while (current != BreakIterator.DONE) {
        int p = last;
        while (p < current) {
            // Read whole code points: a lone surrogate is never a letter.
            int codePoint = text.codePointAt(p);
            if (Character.isLetter(codePoint)) {
                return last;
            }
            p += Character.charCount(codePoint);
        }
        last = current;
        current = wb.next();
    }
    return BreakIterator.DONE;
}
(The iterator returned by BreakIterator.getWordInstance() is unique in that the break positions it returns don't represent both the start and end of the thing being iterated over. That is, a sentence-break iterator returns breaks that each represent the end of one sentence and the beginning of the next. With the word-break iterator, the characters between two boundaries might be a word, or they might be the punctuation or whitespace between two words. The above code uses a simple heuristic to determine which boundary is the beginning of a word: If the characters between this boundary and the next boundary include at least one letter (this can be an alphabetical letter, a CJK ideograph, a Hangul syllable, a Kana character, etc.), then the text between this boundary and the next is a word; otherwise, it's the material between words.)

Most used methods

setText
getWordInstance
Returns a new instance of BreakIterator that locates word boundaries.
next
Move the iterator by the specified number of steps in the text. A positive number moves the iterator
following
Sets the iterator's current iteration position to be the first boundary position following the speci
first
Set the iterator to the first boundary position. This is always the beginning index of the text this
getLineInstance
Returns a new instance of BreakIterator that locates legal line-wrapping positions.
current
Return the iterator's current position.
preceding
Sets the iterator's current iteration position to be the last boundary position preceding the specif
getSentenceInstance
Returns a new instance of BreakIterator that locates sentence boundaries.
getCharacterInstance
Returns a new instance of BreakIterator that locates logical-character boundaries.
getText
Returns a CharacterIterator over the text being analyzed. For at least some subclasses of BreakItera
last
Set the iterator to the last boundary position. This is always the "past-the-end" index of the text

Popular in Java

Parsing JSON documents to java classes using gson
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
notifyDataSetChanged (ArrayAdapter)
requestLocationUpdates (LocationManager)
BufferedInputStream (java.io)
A BufferedInputStream adds functionality to another input stream-namely, the ability to buffer the i
Vector (java.util)
Vector is an implementation of List, backed by an array and synchronized. All optional operations in
CountDownLatch (java.util.concurrent)
A synchronization aid that allows one or more threads to wait until a set of operations being perfor
HttpServlet (javax.servlet.http)
Provides an abstract class to be subclassed to create an HTTP servlet suitable for a Web site. A sub
IOUtils (org.apache.commons.io)
General IO stream manipulation utilities. This class provides static utility methods for input/outpu
Project (org.apache.tools.ant)
Central representation of an Ant project. This class defines an Ant project with all of its targets,
Top PhpStorm plugins

How to use BreakIterator in com.ibm.icu.text

Best Java code snippets using com.ibm.icu.text.BreakIterator (Showing top 20 results out of 315)

How to use
BreakIterator
in
com.ibm.icu.text