1 /** Implements the Unicode line breaking algorithm.
2 
3 	The algorithm supports line breaking rules for various languages.
4 */
5 module linebreak;
6 
7 ///
8 unittest {
9 	import std.algorithm : equal, map;
10 
11 	auto text = "Hello, world!\nThis is an English example.";
12 	auto broken = text
13 		.lineBreakRange
14 		.map!(lb => lb.text);
15 
16 	assert(broken.equal(["Hello, ", "world!\n", "This ", "is ", "an ", "English ", "example."]));
17 }
18 
19 import std.uni : CodepointTrie, codepointTrie;
20 import std.algorithm.iteration : splitter;
21 
22 
23 /** Creates a forward range of breakable text segments.
24 
25 	The returned range of `LineBreak` values, when joined together, makes up the
26 	original input string. The `LineBreak.required` property determines whether
27 	a certain break is a hard line break or a line break opportunity.
28 */
29 auto lineBreakRange(string text) { return LineBreakRange(text); }
30 
31 struct LineBreakRange {
32 	private {
33 		string m_text;
34 		size_t m_pos = 0, m_lastPos = 0;
35 		CharClass m_curClass, m_nextClass;
36 		LineBreak m_curBreak;
37 		bool m_empty;
38 	}
39 
40 	this(string text)
41 	{
42 		m_text = text;
43 		if (m_text.length) {
44 			m_curBreak = findNextBreak();
45 			m_curBreak.text = m_text[0 .. m_curBreak.index];
46 		} else m_empty = true;
47 	}
48 
49 	@property bool empty() const { return m_empty; }
50 
51 	@property LineBreak front() const { return m_curBreak; }
52 
53 	@property LineBreakRange save() const { return this; }
54 
55 	void popFront()
56 	{
57 		if (m_curBreak.index >= m_text.length) {
58 			m_empty = true;
59 		} else {
60 			auto sidx = m_curBreak.index;
61 			m_curBreak = findNextBreak();
62 			m_curBreak.text = m_text[sidx .. m_curBreak.index];
63 		}
64 	}
65 
66 	private LineBreak findNextBreak()
67 	{
68 		// get the first char if we're at the beginning of the string
69 		if (m_curClass == CharClass.none)
70 			m_curClass = mapFirst(nextCharClass());
71 
72 		while (m_pos < m_text.length) {
73 			m_lastPos = m_pos;
74 			auto lastClass = m_nextClass;
75 			m_nextClass = nextCharClass();
76 
77 			// explicit newline
78 			if (m_curClass == CharClass.BK || (m_curClass == CharClass.CR && m_nextClass != CharClass.LF)) {
79 				m_curClass = mapFirst(mapClass(m_nextClass));
80 				return LineBreak(m_lastPos, true);
81 			}
82 
83 			// handle classes not handled by the pair table
84 			CharClass cur;
85 			switch (m_nextClass) with (CharClass) {
86 				default:         cur = none; break;
87 				case SP:         cur = m_curClass; break;
88 				case BK, LF, NL: cur = BK; break;
89 				case CR:         cur = CR; break;
90 				case CB:         cur = BA; break;
91 			}
92 
93 			if (cur != CharClass.none) {
94 				m_curClass = cur;
95 				if (m_nextClass == CharClass.CB)
96 					return LineBreak(m_lastPos);
97 				continue;
98 			}
99 
100 			// if not handled already, use the pair table
101 			bool shouldBreak = false;
102 			assert(m_curClass != CharClass.none);
103 			assert(m_nextClass != CharClass.none);
104 			switch (pairTable[m_curClass][m_nextClass]) with (Break) {
105 				default: break;
106 				case DI: // Direct break
107 					shouldBreak = true;
108 					break;
109 				case IN: // possible indirect break
110 					shouldBreak = lastClass == CharClass.SP;
111 					break;
112 				case CI:
113 					shouldBreak = lastClass == CharClass.SP;
114 					if (!shouldBreak)
115 						continue;
116 					break;
117 				case CP: // prohibited for combining marks
118 					if (lastClass != CharClass.SP)
119 						continue;
120 					break;
121 			}
122 
123 			m_curClass = m_nextClass;
124 			if (shouldBreak)
125 				return LineBreak(m_lastPos);
126 		}
127 
128 		assert (m_pos >= m_text.length);
129 		assert (m_lastPos < m_text.length);
130 		m_lastPos = m_text.length;
131 		return LineBreak(m_text.length);
132 	}
133 
134 	private CharClass mapClass(CharClass c)
135 	{
136 		switch (c) with (CharClass) {
137 			case AI:         return AL;
138 			case SA, SG, XX: return AL;
139 			case CJ:         return NS;
140 			default:         return c;
141 		}
142 	}
143 
144 	private CharClass mapFirst(CharClass c)
145 	{
146 		switch (c) with (CharClass) {
147 			case LF, NL: return BK;
148 			case CB:     return BA;
149 			case SP:     return WJ;
150 			default:     return c;
151 		}
152 	}
153 
154 	private CharClass nextCharClass(bool first = false)
155 	{
156 		import std.utf : decode;
157 		auto cp = m_text.decode(m_pos);
158 		assert (cp != 0x3002 || s_characterClasses[cp] == CharClass.CL);
159 		return mapClass(s_characterClasses[cp]);
160 	}
161 }
162 
163 unittest {
164 	import std.algorithm.comparison : among, equal;
165 	import std.algorithm.iteration : filter, map;
166 	import std.algorithm.searching : canFind;
167 	import std.array : split;
168 	import std.conv : parse, to;
169 	import std.stdio : File;
170 	import std.range : enumerate;
171 	import std.string : indexOf, strip;
172 
173 	// these tests are weird, possibly incorrect or just tailored differently. we skip them.
174 	static const skip = [
175 		812,   814,  848,  850,  864,  866,  900,  902,  956,  958, 1068, 1070,
176 		1072, 1074, 1224, 1226, 1228, 1230, 1760, 1762, 2932, 2934, 4100, 4101,
177 		4102, 4103, 4340, 4342, 4496, 4498, 4568, 4570, 4704, 4706, 4707, 4708,
178 		4710, 4711, 4712, 4714, 4715, 4716, 4718, 4719, 4722, 4723, 4726, 4727,
179 		4730, 4731, 4734, 4735, 4736, 4738, 4739, 4742, 4743, 4746, 4747, 4748,
180 		4750, 4751, 4752, 4754, 4755, 4756, 4758, 4759, 4760, 4762, 4763, 4764,
181 		4766, 4767, 4768, 4770, 4771, 4772, 4774, 4775, 4778, 4779, 4780, 4782,
182 		4783, 4784, 4786, 4787, 4788, 4790, 4791, 4794, 4795, 4798, 4799, 4800,
183 		4802, 4803, 4804, 4806, 4807, 4808, 4810, 4811, 4812, 4814, 4815, 4816,
184 		4818, 4819, 4820, 4822, 4823, 4826, 4827, 4830, 4831, 4834, 4835, 4838,
185 		4839, 4840, 4842, 4843, 4844, 4846, 4847, 4848, 4850, 4851, 4852, 4854,
186 		4855, 4856, 4858, 4859, 4960, 4962, 5036, 5038, 6126, 6135, 6140, 6225,
187 		6226, 6227, 6228, 6229, 6230, 6232, 6233, 6234, 6235, 6236, 6332];
188 
189 	foreach (i, ln; File("LineBreakTest.txt", "rt").byLine.enumerate) {
190 		if (skip.canFind(i)) continue;
191 
192 		auto hash = ln.indexOf('#');
193 		if (hash >= 0) ln = ln[0 .. hash];
194 		ln = ln.strip();
195 
196 		if (!ln.length)
197 			continue;
198 
199 		auto str = ln
200 			.split!(ch => ch.among('×', '÷'))[1 .. $-1]
201 			.map!((c) { auto cs = c.strip; return cast(dchar)cs.parse!uint(16); })
202 			.to!string;
203 
204 		auto breaks = lineBreakRange(str)
205 			.map!(b => b.text);
206 
207 		auto expected = ln.split('÷')[0 .. $-1].map!((c) {
208 			return c
209 				.splitter('×')
210 				.filter!(cp => cp.length > 0)
211 				.map!((cp) { auto cs = cp.strip; return cast(dchar)cs.parse!uint(16); })
212 				.to!string;
213 		});
214 
215 		assert(breaks.save.equal(expected));
216 	}
217 }
218 
219 struct LineBreak {
220 	size_t index;
221 	bool required = false;
222 	string text;
223 
224 	this(size_t index, bool required = false)
225 	{
226 		this.index = index;
227 		this.required = required;
228 	}
229 }
230 
231 
232 private __gshared typeof(codepointTrie!(CharClass, 8, 5, 8)((CharClass[dchar]).init)) s_characterClasses;
233 
234 shared static this()
235 {
236 	import std.conv : parse, to;
237 	import std.string : indexOf, strip;
238 
239 	CharClass[dchar] map;
240 
241 	foreach (ln; import("LineBreak.txt").splitter('\n')) {
242 		auto hash = ln.indexOf('#');
243 		if (hash >= 0) ln = ln[0 .. hash];
244 		ln = ln.strip();
245 		if (!ln.length) continue;
246 
247 		auto sem = ln.indexOf(';');
248 		auto cls = ln[sem+1 .. $].to!CharClass;
249 		ln = ln[0 .. sem];
250 
251 		auto rng = ln.indexOf("..");
252 		if (rng >= 0) {
253 			auto a = ln[0 .. rng];
254 			auto b = ln[rng+2 .. $];
255 			foreach (i; a.parse!uint(16) .. b.parse!uint(16)+1)
256 				map[cast(dchar)i] = cls;
257 		} else {
258 			map[cast(dchar)ln.parse!uint(16)] = cls;
259 		}
260 	}
261 
262 	s_characterClasses = codepointTrie!(CharClass, 8, 5, 8)(map, CharClass.XX);
263 }
264 
265 private enum CharClass {
266 	none = -1,
267 	// The following break classes are handled by the pair table
268 	OP = 0,   // Opening punctuation
269 	CL = 1,   // Closing punctuation
270 	CP = 2,   // Closing parenthesis
271 	QU = 3,   // Ambiguous quotation
272 	GL = 4,   // Glue
273 	NS = 5,   // Non-starters
274 	EX = 6,   // Exclamation/Interrogation
275 	SY = 7,   // Symbols allowing break after
276 	IS = 8,   // Infix separator
277 	PR = 9,   // Prefix
278 	PO = 10,  // Postfix
279 	NU = 11,  // Numeric
280 	AL = 12,  // Alphabetic
281 	HL = 13,  // Hebrew Letter
282 	ID = 14,  // Ideographic
283 	IN = 15,  // Inseparable characters
284 	HY = 16,  // Hyphen
285 	BA = 17,  // Break after
286 	BB = 18,  // Break before
287 	B2 = 19,  // Break on either side (but not pair)
288 	ZW = 20,  // Zero-width space
289 	CM = 21,  // Combining marks
290 	WJ = 22,  // Word joiner
291 	H2 = 23,  // Hangul LV
292 	H3 = 24,  // Hangul LVT
293 	JL = 25,  // Hangul L Jamo
294 	JV = 26,  // Hangul V Jamo
295 	JT = 27,  // Hangul T Jamo
296 	RI = 28,  // Regional Indicator
297 
298 	// The following break classes are not handled by the pair table
299 	AI = 29,  // Ambiguous (Alphabetic or Ideograph)
300 	BK = 30,  // Break (mandatory)
301 	CB = 31,  // Contingent break
302 	CJ = 32,  // Conditional Japanese Starter
303 	CR = 33,  // Carriage return
304 	LF = 34,  // Line feed
305 	NL = 35,  // Next line
306 	SA = 36,  // South-East Asian
307 	SG = 37,  // Surrogates
308 	SP = 38,  // Space
309 	XX = 39,  // Unknown
310 }
311 
312 private enum Break {
313 	DI = 0, // Direct break opportunity
314 	IN = 1, // Indirect break opportunity
315 	CI = 2, // Indirect break opportunity for combining marks
316 	CP = 3, // Prohibited break for combining marks
317 	PR = 4, // Prohibited break
318 }
319 
320 // table generated from http://www.unicode.org/reports/tr14/#Table2
321 private static immutable Break[29][29] pairTable = [
322 	[Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.CP, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR],
323 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
324 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
325 	[Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN],
326 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN],
327 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
328 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
329 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
330 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
331 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI],
332 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
333 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
334 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
335 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
336 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
337 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
338 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.DI, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
339 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.DI, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
340 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN],
341 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.PR, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
342 	[Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
343 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI],
344 	[Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN],
345 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI],
346 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.DI],
347 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI],
348 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI],
349 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.DI],
350 	[Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN]
351 ];