libunibreak  4.3
Data Structures | Macros | Enumerations | Functions | Variables
linebreak.c File Reference
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"
Include dependency graph for linebreak.c:

Data Structures

struct  LineBreakPropertiesIndex
 Struct for the second-level index to the line breaking properties. More...
 

Macros

#define LINEBREAK_UNDEFINED   -1
 Special value used internally to indicate an undefined break result. More...
 
#define LINEBREAK_INDEX_SIZE   40
 Size of the second-level index to the line breaking properties. More...
 
#define ENDS_WITH(str, suffix)   ends_with((str), (suffix), sizeof(suffix) - 1)
 

Enumerations

enum  BreakAction {
  DIR_BRK, IND_BRK, CMI_BRK, CMP_BRK,
  PRH_BRK
}
 Enumeration of break actions. More...
 

Functions

static __inline int ends_with (const char *str, const char *suffix, unsigned suffixLen)
 Checks whether str ends with suffix, which has length suffixLen. More...
 
void init_linebreak (void)
 Initializes the second-level index to the line breaking properties. More...
 
static const struct LineBreakProperties * get_lb_prop_lang (const char *lang)
 Gets the language-specific line breaking properties. More...
 
static enum LineBreakClass get_char_lb_class (utf32_t ch, const struct LineBreakProperties *lbp)
 Gets the line breaking class of a character from a line breaking properties array. More...
 
static enum LineBreakClass get_char_lb_class_default (utf32_t ch)
 Gets the line breaking class of a character from the default line breaking properties array. More...
 
static enum LineBreakClass get_char_lb_class_lang (utf32_t ch, const struct LineBreakProperties *lbpLang)
 Gets the line breaking class of a character for a specific language. More...
 
static enum LineBreakClass resolve_lb_class (enum LineBreakClass lbc, const char *lang)
 Resolves the line breaking class for certain ambiguous or complicated characters. More...
 
static void treat_first_char (struct LineBreakContext *lbpCtx)
 Applies special treatment to the first character in a line. More...
 
static int get_lb_result_simple (struct LineBreakContext *lbpCtx)
 Tries telling the line break opportunity by simple rules. More...
 
static int get_lb_result_lookup (struct LineBreakContext *lbpCtx)
 Tells the line break opportunity by table lookup. More...
 
void lb_init_break_context (struct LineBreakContext *lbpCtx, utf32_t ch, const char *lang)
 Initializes line breaking context for a given language. More...
 
int lb_process_next_char (struct LineBreakContext *lbpCtx, utf32_t ch)
 Updates the LineBreakContext for the next codepoint and returns the detected break. More...
 
void set_linebreaks (const void *s, size_t len, const char *lang, char *brks, get_next_char_t get_next_char)
 Sets the line breaking information for a generic input string. More...
 
void set_linebreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks)
 Sets the line breaking information for a UTF-8 input string. More...
 
void set_linebreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks)
 Sets the line breaking information for a UTF-16 input string. More...
 
void set_linebreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks)
 Sets the line breaking information for a UTF-32 input string. More...
 
int is_line_breakable (utf32_t char1, utf32_t char2, const char *lang)
 Tells whether a line break can occur between two Unicode characters. More...
 

Variables

static enum BreakAction baTable [LBP_CB][LBP_CB]
 Break action pair table. More...
 
static struct LineBreakPropertiesIndex lb_prop_index [LINEBREAK_INDEX_SIZE]
 Second-level index to the line breaking properties. More...
 

Detailed Description

Implementation of the line breaking algorithm as described in Unicode Standard Annex 14.

Author
Wu Yongwei
Petr Filipsky

Macro Definition Documentation

◆ ENDS_WITH

#define ENDS_WITH (   str,
  suffix 
)    ends_with((str), (suffix), sizeof(suffix) - 1)

◆ LINEBREAK_INDEX_SIZE

#define LINEBREAK_INDEX_SIZE   40

Size of the second-level index to the line breaking properties.

◆ LINEBREAK_UNDEFINED

#define LINEBREAK_UNDEFINED   -1

Special value used internally to indicate an undefined break result.

Enumeration Type Documentation

◆ BreakAction

Enumeration of break actions.

They are used in the break action pair table baTable.

Enumerator
DIR_BRK 

Direct break opportunity.

IND_BRK 

Indirect break opportunity.

CMI_BRK 

Indirect break opportunity for combining marks.

CMP_BRK 

Prohibited break for combining marks.

PRH_BRK 

Prohibited break.

Function Documentation

◆ ends_with()

static __inline int ends_with ( const char *  str,
const char *  suffix,
unsigned  suffixLen 
)
static

Checks whether str ends with suffix, which has length suffixLen.

Parameters
str: string whose ending is to be checked
suffix: string to check
suffixLen: length of suffix
Returns
non-zero if true; zero otherwise

◆ get_char_lb_class()

static enum LineBreakClass get_char_lb_class ( utf32_t  ch,
const struct LineBreakProperties *  lbp 
)
static

Gets the line breaking class of a character from a line breaking properties array.

Parameters
ch: character to check
lbp: pointer to the line breaking properties array
Returns
the line breaking class if found; LBP_XX otherwise

◆ get_char_lb_class_default()

static enum LineBreakClass get_char_lb_class_default ( utf32_t  ch)
static

Gets the line breaking class of a character from the default line breaking properties array.

Parameters
ch: character to check
Returns
the line breaking class if found; LBP_XX otherwise

◆ get_char_lb_class_lang()

static enum LineBreakClass get_char_lb_class_lang ( utf32_t  ch,
const struct LineBreakProperties *  lbpLang 
)
static

Gets the line breaking class of a character for a specific language.

This function will check the language-specific data first, and then the default data if there is no language-specific property available for the character.

Parameters
ch: character to check
lbpLang: pointer to the language-specific line breaking properties array
Returns
the line breaking class if found; LBP_XX otherwise

◆ get_lb_prop_lang()

static const struct LineBreakProperties* get_lb_prop_lang ( const char *  lang)
static

Gets the language-specific line breaking properties.

Parameters
lang: language of the text
Returns
pointer to the language-specific line breaking properties array if found; NULL otherwise

◆ get_lb_result_lookup()

static int get_lb_result_lookup ( struct LineBreakContext *  lbpCtx)
static

Tells the line break opportunity by table lookup.

Parameters
[in,out] lbpCtx: pointer to the line breaking context
Precondition
lbpCtx->lbcCur has the current line break class; lbpCtx->lbcLast has the line break class for the last character; and lbpCtx->lbcNew has the line break class for the next character
Postcondition
lbpCtx->lbcCur has the updated line break class
Returns
break result, one of LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, and LINEBREAK_NOBREAK

◆ get_lb_result_simple()

static int get_lb_result_simple ( struct LineBreakContext *  lbpCtx)
static

Tries telling the line break opportunity by simple rules.

Parameters
[in,out] lbpCtx: pointer to the line breaking context
Precondition
lbpCtx->lbcCur has the current line break class; and lbpCtx->lbcNew has the line break class for the next character
Postcondition
lbpCtx->lbcCur has the updated line break class
Returns
break result, one of LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, and LINEBREAK_NOBREAK if identified; or LINEBREAK_UNDEFINED if table lookup is needed

◆ init_linebreak()

void init_linebreak ( void  )

Initializes the second-level index to the line breaking properties.

If it is not called, the performance of get_char_lb_class_lang (and thus the main functionality) can be pretty bad, especially for big codepoints like those of Chinese.

◆ is_line_breakable()

int is_line_breakable ( utf32_t  char1,
utf32_t  char2,
const char *  lang 
)

Tells whether a line break can occur between two Unicode characters.

This is a wrapper function to expose a simple interface. Generally speaking, it is better to use set_linebreaks_utf32 instead, since this function cannot correctly process complicated cases involving combining marks, spaces, etc.

Parameters
char1: the first Unicode character
char2: the second Unicode character
lang: language of the input
Returns
one of LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR

◆ lb_init_break_context()

void lb_init_break_context ( struct LineBreakContext *  lbpCtx,
utf32_t  ch,
const char *  lang 
)

Initializes line breaking context for a given language.

Parameters
[in,out] lbpCtx: pointer to the line breaking context
[in] ch: the first character to process
[in] lang: language of the input
Postcondition
the line breaking context is initialized

◆ lb_process_next_char()

int lb_process_next_char ( struct LineBreakContext *  lbpCtx,
utf32_t  ch 
)

Updates the LineBreakContext for the next codepoint and returns the detected break.

Parameters
[in,out] lbpCtx: pointer to the line breaking context
[in] ch: Unicode codepoint
Returns
break result, one of LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, and LINEBREAK_NOBREAK
Postcondition
the line breaking context is updated

◆ resolve_lb_class()

static enum LineBreakClass resolve_lb_class ( enum LineBreakClass  lbc,
const char *  lang 
)
static

Resolves the line breaking class for certain ambiguous or complicated characters.

They are treated in a simplistic way in this implementation.

Parameters
lbc: line breaking class to resolve
lang: language of the text
Returns
the resolved line breaking class

◆ set_linebreaks()

void set_linebreaks ( const void *  s,
size_t  len,
const char *  lang,
char *  brks,
get_next_char_t  get_next_char 
)

Sets the line breaking information for a generic input string.

Currently, this implementation has customization for the following ISO 639-1 language codes (for lang):

  • de (German)
  • en (English)
  • es (Spanish)
  • fr (French)
  • ja (Japanese)
  • ko (Korean)
  • ru (Russian)
  • zh (Chinese)

In addition, the suffix "-strict" may be added to indicate strict (as opposed to normal) line-breaking behaviour. See the Conditional Japanese Starter section of UAX #14 for more details.

Parameters
[in] s: input string
[in] len: length of the input
[in] lang: language of the input
[out] brks: pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR
[in] get_next_char: function to get the next UTF-32 character

◆ set_linebreaks_utf16()

void set_linebreaks_utf16 ( const utf16_t *  s,
size_t  len,
const char *  lang,
char *  brks 
)

Sets the line breaking information for a UTF-16 input string.

Parameters
[in] s: input UTF-16 string
[in] len: length of the input
[in] lang: language of the input
[out] brks: pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR
See also
set_linebreaks for a note about lang.

◆ set_linebreaks_utf32()

void set_linebreaks_utf32 ( const utf32_t *  s,
size_t  len,
const char *  lang,
char *  brks 
)

Sets the line breaking information for a UTF-32 input string.

Parameters
[in] s: input UTF-32 string
[in] len: length of the input
[in] lang: language of the input
[out] brks: pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR
See also
set_linebreaks for a note about lang.

◆ set_linebreaks_utf8()

void set_linebreaks_utf8 ( const utf8_t *  s,
size_t  len,
const char *  lang,
char *  brks 
)

Sets the line breaking information for a UTF-8 input string.

Parameters
[in] s: input UTF-8 string
[in] len: length of the input
[in] lang: language of the input
[out] brks: pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR
See also
set_linebreaks for a note about lang.

◆ treat_first_char()

static void treat_first_char ( struct LineBreakContext *  lbpCtx)
static

Applies special treatment to the first character in a line.

Parameters
[in,out] lbpCtx: pointer to the line breaking context
Precondition
lbpCtx->lbcCur has a valid line break class
Postcondition
lbpCtx->lbcCur has the updated line break class

Variable Documentation

◆ baTable

enum BreakAction baTable[LBP_CB][LBP_CB]
static

Break action pair table.

This is a direct mapping of Table 2 of Unicode Standard Annex 14, Revision 37, except for the following:

  • CB (manually added as per LB20)
  • ZWJ (manually adjusted after special processing as per LB8a of Revision 41)
  • CL, CP, NS, SY, IS, PR, PO, HY, BA, B2, and RI (manually adjusted as per LB22 of Revision 45)

◆ lb_prop_index

struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE]
static
Initial value:
=
{
{ 0xFFFFFFFF, lb_prop_default }
}

Second-level index to the line breaking properties.

lb_prop_default
const struct LineBreakProperties lb_prop_default[]
Default line breaking properties as from the Unicode Web site.
Definition: linebreakdata.c:9