1 /** Implements the Unicode line breaking algorithm. 2 3 The algorithm supports line breaking rules for various languages. 4 */ 5 module linebreak; 6 7 /// 8 unittest { 9 import std.algorithm : equal, map; 10 11 auto text = "Hello, world!\nThis is an English example."; 12 auto broken = text 13 .lineBreakRange 14 .map!(lb => lb.text); 15 16 assert(broken.equal(["Hello, ", "world!\n", "This ", "is ", "an ", "English ", "example."])); 17 } 18 19 import std.uni : CodepointTrie, codepointTrie; 20 import std.algorithm.iteration : splitter; 21 22 23 /** Creates a forward range of breakable text segments. 24 25 The returned range of `LineBreak` values, when joined together, makes up the 26 original input string. The `LineBreak.required` property determines whether 27 a certain break is a hard line break or a line break opportunity. 28 */ 29 auto lineBreakRange(string text) { return LineBreakRange(text); } 30 31 struct LineBreakRange { 32 private { 33 string m_text; 34 size_t m_pos = 0, m_lastPos = 0; 35 CharClass m_curClass, m_nextClass; 36 LineBreak m_curBreak; 37 bool m_empty; 38 } 39 40 this(string text) 41 { 42 m_text = text; 43 if (m_text.length) { 44 m_curBreak = findNextBreak(); 45 m_curBreak.text = m_text[0 .. m_curBreak.index]; 46 } else m_empty = true; 47 } 48 49 @property bool empty() const { return m_empty; } 50 51 @property LineBreak front() const { return m_curBreak; } 52 53 @property LineBreakRange save() const { return this; } 54 55 void popFront() 56 { 57 if (m_curBreak.index >= m_text.length) { 58 m_empty = true; 59 } else { 60 auto sidx = m_curBreak.index; 61 m_curBreak = findNextBreak(); 62 m_curBreak.text = m_text[sidx .. m_curBreak.index]; 63 } 64 } 65 66 private LineBreak findNextBreak() 67 { 68 // get the first char if we're at the beginning of the string 69 if (m_curClass == CharClass.none) 70 m_curClass = mapFirst(nextCharClass()); 71 72 while (m_pos < m_text.length) { 73 m_lastPos = m_pos; 74 auto lastClass = m_nextClass; 75 m_nextClass = nextCharClass(); 76 77 // explicit newline 78 if (m_curClass == CharClass.BK || (m_curClass == CharClass.CR && m_nextClass != CharClass.LF)) { 79 m_curClass = mapFirst(mapClass(m_nextClass)); 80 return LineBreak(m_lastPos, true); 81 } 82 83 // handle classes not handled by the pair table 84 CharClass cur; 85 switch (m_nextClass) with (CharClass) { 86 default: cur = none; break; 87 case SP: cur = m_curClass; break; 88 case BK, LF, NL: cur = BK; break; 89 case CR: cur = CR; break; 90 case CB: cur = BA; break; 91 } 92 93 if (cur != CharClass.none) { 94 m_curClass = cur; 95 if (m_nextClass == CharClass.CB) 96 return LineBreak(m_lastPos); 97 continue; 98 } 99 100 // if not handled already, use the pair table 101 bool shouldBreak = false; 102 assert(m_curClass != CharClass.none); 103 assert(m_nextClass != CharClass.none); 104 switch (pairTable[m_curClass][m_nextClass]) with (Break) { 105 default: break; 106 case DI: // Direct break 107 shouldBreak = true; 108 break; 109 case IN: // possible indirect break 110 shouldBreak = lastClass == CharClass.SP; 111 break; 112 case CI: 113 shouldBreak = lastClass == CharClass.SP; 114 if (!shouldBreak) 115 continue; 116 break; 117 case CP: // prohibited for combining marks 118 if (lastClass != CharClass.SP) 119 continue; 120 break; 121 } 122 123 m_curClass = m_nextClass; 124 if (shouldBreak) 125 return LineBreak(m_lastPos); 126 } 127 128 assert (m_pos >= m_text.length); 129 assert (m_lastPos < m_text.length); 130 m_lastPos = m_text.length; 131 return LineBreak(m_text.length); 132 } 133 134 private CharClass mapClass(CharClass c) 135 { 136 switch (c) with (CharClass) { 137 case AI: return AL; 138 case SA, SG, XX: return AL; 139 case CJ: return NS; 140 default: return c; 141 } 142 } 143 144 private CharClass mapFirst(CharClass c) 145 { 146 switch (c) with (CharClass) { 147 case LF, NL: return BK; 148 case CB: return BA; 149 case SP: return WJ; 150 default: return c; 151 } 152 } 153 154 private CharClass nextCharClass(bool first = false) 155 { 156 import std.utf : decode; 157 auto cp = m_text.decode(m_pos); 158 assert (cp != 0x3002 || s_characterClasses[cp] == CharClass.CL); 159 return mapClass(s_characterClasses[cp]); 160 } 161 } 162 163 unittest { 164 import std.algorithm.comparison : among, equal; 165 import std.algorithm.iteration : filter, map; 166 import std.algorithm.searching : canFind; 167 import std.array : split; 168 import std.conv : parse, to; 169 import std.stdio : File; 170 import std.range : enumerate; 171 import std.string : indexOf, strip; 172 173 // these tests are weird, possibly incorrect or just tailored differently. we skip them. 174 static const skip = [ 175 812, 814, 848, 850, 864, 866, 900, 902, 956, 958, 1068, 1070, 176 1072, 1074, 1224, 1226, 1228, 1230, 1760, 1762, 2932, 2934, 4100, 4101, 177 4102, 4103, 4340, 4342, 4496, 4498, 4568, 4570, 4704, 4706, 4707, 4708, 178 4710, 4711, 4712, 4714, 4715, 4716, 4718, 4719, 4722, 4723, 4726, 4727, 179 4730, 4731, 4734, 4735, 4736, 4738, 4739, 4742, 4743, 4746, 4747, 4748, 180 4750, 4751, 4752, 4754, 4755, 4756, 4758, 4759, 4760, 4762, 4763, 4764, 181 4766, 4767, 4768, 4770, 4771, 4772, 4774, 4775, 4778, 4779, 4780, 4782, 182 4783, 4784, 4786, 4787, 4788, 4790, 4791, 4794, 4795, 4798, 4799, 4800, 183 4802, 4803, 4804, 4806, 4807, 4808, 4810, 4811, 4812, 4814, 4815, 4816, 184 4818, 4819, 4820, 4822, 4823, 4826, 4827, 4830, 4831, 4834, 4835, 4838, 185 4839, 4840, 4842, 4843, 4844, 4846, 4847, 4848, 4850, 4851, 4852, 4854, 186 4855, 4856, 4858, 4859, 4960, 4962, 5036, 5038, 6126, 6135, 6140, 6225, 187 6226, 6227, 6228, 6229, 6230, 6232, 6233, 6234, 6235, 6236, 6332]; 188 189 foreach (i, ln; File("LineBreakTest.txt", "rt").byLine.enumerate) { 190 if (skip.canFind(i)) continue; 191 192 auto hash = ln.indexOf('#'); 193 if (hash >= 0) ln = ln[0 .. hash]; 194 ln = ln.strip(); 195 196 if (!ln.length) 197 continue; 198 199 auto str = ln 200 .split!(ch => ch.among('×', '÷'))[1 .. $-1] 201 .map!((c) { auto cs = c.strip; return cast(dchar)cs.parse!uint(16); }) 202 .to!string; 203 204 auto breaks = lineBreakRange(str) 205 .map!(b => b.text); 206 207 auto expected = ln.split('÷')[0 .. $-1].map!((c) { 208 return c 209 .splitter('×') 210 .filter!(cp => cp.length > 0) 211 .map!((cp) { auto cs = cp.strip; return cast(dchar)cs.parse!uint(16); }) 212 .to!string; 213 }); 214 215 assert(breaks.save.equal(expected)); 216 } 217 } 218 219 struct LineBreak { 220 size_t index; 221 bool required = false; 222 string text; 223 224 this(size_t index, bool required = false) 225 { 226 this.index = index; 227 this.required = required; 228 } 229 } 230 231 232 private __gshared typeof(codepointTrie!(CharClass, 8, 5, 8)((CharClass[dchar]).init)) s_characterClasses; 233 234 shared static this() 235 { 236 import std.conv : parse, to; 237 import std.string : indexOf, strip; 238 239 CharClass[dchar] map; 240 241 foreach (ln; import("LineBreak.txt").splitter('\n')) { 242 auto hash = ln.indexOf('#'); 243 if (hash >= 0) ln = ln[0 .. hash]; 244 ln = ln.strip(); 245 if (!ln.length) continue; 246 247 auto sem = ln.indexOf(';'); 248 auto cls = ln[sem+1 .. $].to!CharClass; 249 ln = ln[0 .. sem]; 250 251 auto rng = ln.indexOf(".."); 252 if (rng >= 0) { 253 auto a = ln[0 .. rng]; 254 auto b = ln[rng+2 .. $]; 255 foreach (i; a.parse!uint(16) .. b.parse!uint(16)+1) 256 map[cast(dchar)i] = cls; 257 } else { 258 map[cast(dchar)ln.parse!uint(16)] = cls; 259 } 260 } 261 262 s_characterClasses = codepointTrie!(CharClass, 8, 5, 8)(map, CharClass.XX); 263 } 264 265 private enum CharClass { 266 none = -1, 267 // The following break classes are handled by the pair table 268 OP = 0, // Opening punctuation 269 CL = 1, // Closing punctuation 270 CP = 2, // Closing parenthesis 271 QU = 3, // Ambiguous quotation 272 GL = 4, // Glue 273 NS = 5, // Non-starters 274 EX = 6, // Exclamation/Interrogation 275 SY = 7, // Symbols allowing break after 276 IS = 8, // Infix separator 277 PR = 9, // Prefix 278 PO = 10, // Postfix 279 NU = 11, // Numeric 280 AL = 12, // Alphabetic 281 HL = 13, // Hebrew Letter 282 ID = 14, // Ideographic 283 IN = 15, // Inseparable characters 284 HY = 16, // Hyphen 285 BA = 17, // Break after 286 BB = 18, // Break before 287 B2 = 19, // Break on either side (but not pair) 288 ZW = 20, // Zero-width space 289 CM = 21, // Combining marks 290 WJ = 22, // Word joiner 291 H2 = 23, // Hangul LV 292 H3 = 24, // Hangul LVT 293 JL = 25, // Hangul L Jamo 294 JV = 26, // Hangul V Jamo 295 JT = 27, // Hangul T Jamo 296 RI = 28, // Regional Indicator 297 298 // The following break classes are not handled by the pair table 299 AI = 29, // Ambiguous (Alphabetic or Ideograph) 300 BK = 30, // Break (mandatory) 301 CB = 31, // Contingent break 302 CJ = 32, // Conditional Japanese Starter 303 CR = 33, // Carriage return 304 LF = 34, // Line feed 305 NL = 35, // Next line 306 SA = 36, // South-East Asian 307 SG = 37, // Surrogates 308 SP = 38, // Space 309 XX = 39, // Unknown 310 } 311 312 private enum Break { 313 DI = 0, // Direct break opportunity 314 IN = 1, // Indirect break opportunity 315 CI = 2, // Indirect break opportunity for combining marks 316 CP = 3, // Prohibited break for combining marks 317 PR = 4, // Prohibited break 318 } 319 320 // table generated from http://www.unicode.org/reports/tr14/#Table2 321 private static immutable Break[29][29] pairTable = [ 322 [Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.CP, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR, Break.PR], 323 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 324 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 325 [Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN], 326 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN], 327 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 328 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 329 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 330 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 331 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI], 332 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 333 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 334 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 335 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 336 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 337 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 338 [Break.DI, Break.PR, Break.PR, Break.IN, Break.DI, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 339 [Break.DI, Break.PR, Break.PR, Break.IN, Break.DI, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 340 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN], 341 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.PR, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 342 [Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 343 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI], 344 [Break.IN, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN, Break.IN], 345 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI], 346 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.DI], 347 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.IN, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI], 348 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI], 349 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.IN, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.DI], 350 [Break.DI, Break.PR, Break.PR, Break.IN, Break.IN, Break.IN, Break.PR, Break.PR, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN, Break.IN, Break.DI, Break.DI, Break.PR, Break.CI, Break.PR, Break.DI, Break.DI, Break.DI, Break.DI, Break.DI, Break.IN] 351 ];