org.apache.commons.codec.language.bm java code examples

/**
 * Determines the single language a word most likely originates from.
 *
 * @param text
 *            the word to examine
 * @return the one language in the guessed language set when that set is a singleton, otherwise
 *         {@link Languages#ANY}
 */
public String guessLanguage(final String text) {
  final Languages.LanguageSet candidates = guessLanguages(text);
  if (candidates.isSingleton()) {
    // Exactly one candidate: report it.
    return candidates.getAny();
  }
  // No unique match.
  return Languages.ANY;
}

/**
 * Creates a builder seeded with a single phoneme whose text is the empty string and whose languages are
 * the given set. Further text can then be appended to it. This should be the only way to create a new
 * phoneme from scratch.
 *
 * @param languages the set of languages attached to the initial empty phoneme
 * @return a new phoneme builder holding only that empty phoneme
 */
public static PhonemeBuilder empty(final Languages.LanguageSet languages) {
  final Rule.Phoneme blank = new Rule.Phoneme("", languages);
  return new PhonemeBuilder(blank);
}

/**
 * Encodes a string to its phonetic representation, first guessing the languages of the input and then
 * delegating to the two-argument {@code encode} with that language set.
 *
 * @param input
 *            the String to encode
 * @return the encoding of the input
 */
public String encode(final String input) {
  return encode(input, this.lang.guessLanguages(input));
}

/**
 * Changes the rule type by replacing the underlying engine with a new one that keeps the current name
 * type, concat flag and maximum phoneme count. This will widen or narrow the range of phonetic
 * encodings considered.
 *
 * @param ruleType
 *            {@link RuleType#APPROX} or {@link RuleType#EXACT} for approximate or exact phonetic matches
 */
public void setRuleType(final RuleType ruleType) {
  final PhoneticEngine current = this.engine;
  this.engine = new PhoneticEngine(
      current.getNameType(), ruleType, current.isConcat(), current.getMaxPhonemes());
}

/**
 * Changes the type of name by replacing the underlying engine with a new one that keeps the current
 * rule type, concat flag and maximum phoneme count. Use {@link NameType#GENERIC} unless you
 * specifically want phonetic encodings optimized for Ashkenazi or Sephardic Jewish family names.
 *
 * @param nameType
 *            the NameType in use
 */
public void setNameType(final NameType nameType) {
  final PhoneticEngine current = this.engine;
  this.engine = new PhoneticEngine(
      nameType, current.getRuleType(), current.isConcat(), current.getMaxPhonemes());
}

/**
 * Sets the maximum number of phonemes that shall be considered by the engine. The underlying engine is
 * replaced with a new one that keeps the current name type, rule type and concat flag.
 *
 * @param maxPhonemes
 *            the maximum number of phonemes returned by the engine
 * @since 1.7
 */
public void setMaxPhonemes(final int maxPhonemes) {
  final PhoneticEngine current = this.engine;
  this.engine = new PhoneticEngine(
      current.getNameType(), current.getRuleType(), current.isConcat(), maxPhonemes);
}

/**
 * Sets how multiple possible phonetic encodings are combined. The underlying engine is replaced with a
 * new one that keeps the current name type, rule type and maximum phoneme count.
 *
 * @param concat
 *            true if multiple encodings are to be combined with a '|', false if just the first one is
 *            to be considered
 */
public void setConcat(final boolean concat) {
  final PhoneticEngine current = this.engine;
  this.engine = new PhoneticEngine(
      current.getNameType(), current.getRuleType(), concat, current.getMaxPhonemes());
}

/**
 * Expects an {@link IndexOutOfBoundsException} when a rule is matched against an input at index -1.
 */
@Test(expected = IndexOutOfBoundsException.class)
public void testNegativeIndexForRuleMatchIndexOutOfBoundsException() {
  final Rule.Phoneme emptyPhoneme = new Rule.Phoneme("", Languages.ANY_LANGUAGE);
  final Rule rule = new Rule("a", "", "", emptyPhoneme);
  // A negative start index is the condition under test.
  rule.patternAndContextMatches("bob", -1);
}

/**
 * Builds a new Phoneme carrying this phoneme's text and the result of merging this phoneme's
 * language set with the given one (a union of the two).
 *
 * @param lang the language set to merge
 * @return a new Phoneme
 */
public Phoneme mergeWithLanguage(final LanguageSet lang) {
  final String text = this.phonemeText.toString();
  return new Phoneme(text, this.languages.merge(lang));
}

/**
 * Builds the classpath-style resource name for a rules file from the name type's name, the rule type's
 * name and a language string, joined with underscores and suffixed with {@code .txt}.
 *
 * @param nameType the name type whose {@code getName()} forms the first part of the file name
 * @param rt the rule type whose {@code getName()} forms the second part
 * @param lang the language string forming the third part
 * @return the formatted resource name
 */
private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
  final String template = "org/apache/commons/codec/language/bm/%s_%s_%s.txt";
  return String.format(template, nameType.getName(), rt.getName(), lang);
}

/**
 * Encodes "gna" with a generic, approximate encoder; the test passes if no exception is thrown. See
 * <a href="https://issues.apache.org/jira/browse/CODEC-125?focusedCommentId=13071566&amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13071566">
 * the CODEC-125 comment</a>.
 *
 * @throws EncoderException
 *             if the encoder reports a failure
 */
@Test
public void testEncodeGna() throws EncoderException {
  final BeiderMorseEncoder encoder = createGenericApproxEncoder();
  encoder.encode("gna");
}

/**
 * Encodes the given string through the underlying engine.
 *
 * @param source
 *            the String to encode, may be {@code null}
 * @return {@code null} when {@code source} is {@code null}, otherwise the engine's encoding of it
 * @throws EncoderException
 *             declared by the overridden method
 */
@Override
public String encode(final String source) throws EncoderException {
  return source == null ? null : this.engine.encode(source);
}

/**
 * Reports the name type of the underlying engine.
 *
 * @return the NameType currently being used
 */
public NameType getNameType() {
  final NameType current = this.engine.getNameType();
  return current;
}

/**
 * Reports the rule type of the underlying engine.
 *
 * @return the RuleType currently being used
 */
public RuleType getRuleType() {
  final RuleType current = this.engine.getRuleType();
  return current;
}

/**
 * Appends <code>str</code> to every phoneme currently held by this builder. The call is forwarded to
 * each phoneme's own <code>append</code>; nothing is returned.
 *
 * @param str   the characters to append to the phonemes
 */
public void append(final CharSequence str) {
  for (final Rule.Phoneme phoneme : this.phonemes) {
    phoneme.append(str);
  }
}

/**
 * Reports whether the underlying engine concatenates multiple possible encodings.
 *
 * @return true if multiple encodings are concatenated, false if just the first one is returned
 */
public boolean isConcat() {
  final boolean concatenated = this.engine.isConcat();
  return concatenated;
}

/**
 * Creates an encoder configured with {@link NameType#GENERIC} and {@link RuleType#APPROX}.
 *
 * @return the configured encoder
 */
private BeiderMorseEncoder createGenericApproxEncoder() {
  final BeiderMorseEncoder genericApprox = new BeiderMorseEncoder();
  genericApprox.setNameType(NameType.GENERIC);
  genericApprox.setRuleType(RuleType.APPROX);
  return genericApprox;
}

/**
 * Expects an {@link IllegalStateException} when language rules are loaded from a resource name that
 * does not exist.
 */
@Test(expected = IllegalStateException.class)
public void testInvalidLangIllegalStateException() {
  final Languages genericLanguages = Languages.getInstance(NameType.GENERIC);
  Lang.loadFromResource("thisIsAMadeUpResourceName", genericLanguages);
}

/**
 * Supplies the encoder under test: a default-constructed {@code BeiderMorseEncoder}.
 *
 * @return a new encoder instance
 */
@Override
protected StringEncoder createStringEncoder() {
  final StringEncoder encoder = new BeiderMorseEncoder();
  return encoder;
}

/**
 * Encodes a very long surname with a generic, approximate encoder and requires the call to finish
 * within the 10-second test timeout.
 *
 * @throws EncoderException
 *             if the encoder reports a failure
 */
@Test(timeout = 10000L)
public void testLongestEnglishSurname() throws EncoderException {
  createGenericApproxEncoder().encode("MacGhilleseatheanaich");
}

Most used classes

PhoneticEngine
Converts words into potential phonetic representations. This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes into account the likely source language.
BeiderMorseEncoder
Encodes strings into their Beider-Morse phonetic encoding. Beider-Morse phonetic encodings are optimised for family names.
Lang
Language guessing utility. This class encapsulates rules used to guess the possible languages that a word originates from.
Languages$LanguageSet
A set of languages.
Languages
Language codes. Language codes are typically loaded from resource files. These are UTF-8 encoded text files.

How to use org.apache.commons.codec.language.bm

Best Java code snippets using org.apache.commons.codec.language.bm (Showing top 20 results out of 315)