1: <?php declare(strict_types=1);
2:
3: namespace Salient\Utility;
4:
5: use Closure;
6: use InvalidArgumentException;
7:
8: /**
9: * Work with strings
10: *
11: * @api
12: */
13: final class Str extends AbstractUtility
14: {
/** ASCII letters: lowercase followed by uppercase */
public const ALPHA = Str::LOWER . Str::UPPER;
/** ASCII letters followed by decimal digits */
public const ALPHANUMERIC = Str::ALPHA . Str::NUMERIC;
/** Hexadecimal digits, with letters in both cases */
public const HEX = '0123456789abcdefABCDEF';
/** Lowercase ASCII letters, in the same order as Str::UPPER */
public const LOWER = 'abcdefghijklmnopqrstuvwxyz';
/** Decimal digits */
public const NUMERIC = '0123456789';
/** Uppercase ASCII letters, in the same order as Str::LOWER */
public const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
/** splitDelimited() flag: do not split double-quoted substrings */
public const PRESERVE_DOUBLE_QUOTED = 1;
/** splitDelimited() flag: do not split single-quoted substrings */
public const PRESERVE_SINGLE_QUOTED = 2;
/** splitDelimited() flag: do not split double- or single-quoted substrings */
public const PRESERVE_QUOTED = Str::PRESERVE_DOUBLE_QUOTED | Str::PRESERVE_SINGLE_QUOTED;
/** Every byte from 0x80 to 0xff, i.e. bytes that are not 7-bit ASCII */
public const ASCII_EXTENDED = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";

/**
 * Default value of mergeLists() parameter $itemRegex
 *
 * Matches optional horizontal whitespace followed by "- " or "* " at the
 * start of a line, and captures it as "indent".
 */
public const DEFAULT_ITEM_REGEX = '/^(?<indent>\h*[-*] )/';
30:
/**
 * Return the first argument that is neither null nor an empty string
 *
 * If every argument is null or empty, the last argument is returned as-is.
 * If no arguments are given, null is returned.
 */
public static function coalesce(?string ...$strings): ?string
{
    $fallback = null;
    foreach ($strings as $candidate) {
        if ($candidate !== null && $candidate !== '') {
            return $candidate;
        }
        // Remember the most recent null or empty value in case nothing
        // better turns up
        $fallback = $candidate;
    }
    return $fallback;
}
45:
/**
 * Convert an ASCII string to lowercase
 *
 * Only the bytes in Str::UPPER ("A" to "Z") are translated, each to the byte
 * at the same position in Str::LOWER. Every other byte is returned unchanged.
 */
public static function lower(string $string): string
{
    return strtr($string, self::UPPER, self::LOWER);
}
53:
/**
 * Convert an ASCII string to uppercase
 *
 * Only the bytes in Str::LOWER ("a" to "z") are translated, each to the byte
 * at the same position in Str::UPPER. Every other byte is returned unchanged.
 */
public static function upper(string $string): string
{
    return strtr($string, self::LOWER, self::UPPER);
}
61:
/**
 * Convert the first byte of an ASCII string to uppercase
 *
 * An empty string is returned unchanged.
 */
public static function upperFirst(string $string): string
{
    return $string === ''
        ? $string
        : self::upper($string[0]) . substr($string, 1);
}
73:
/**
 * Apply the case of one ASCII string to another
 *
 * After trimming `$match`:
 *
 * - if it has two or more characters and no lowercase letters, but at least
 *   one uppercase letter, `$string` is converted to uppercase
 * - if it has lowercase letters and no uppercase letters, `$string` is
 *   converted to lowercase
 * - if it starts with an uppercase letter (and is not covered above),
 *   `$string` is converted to lowercase with an uppercase first character
 * - otherwise, `$string` is returned unchanged
 */
public static function matchCase(string $string, string $match): string
{
    $match = trim($match);
    if ($match === '') {
        return $string;
    }

    // strpbrk() returns the remainder of $match from its first uppercase
    // letter, so it is identical to $match if $match starts with one
    $fromFirstUpper = strpbrk($match, self::UPPER);
    $hasUpper = $fromFirstUpper !== false;
    $hasLower = strpbrk($match, self::LOWER) !== false;

    if ($hasUpper && !$hasLower && strlen($match) > 1) {
        return self::upper($string);
    }

    if ($hasLower && !$hasUpper) {
        return self::lower($string);
    }

    if (
        // @phpstan-ignore booleanNot.alwaysTrue
        (!$hasUpper && !$hasLower)
        || $fromFirstUpper !== $match
    ) {
        return $string;
    }

    return self::upperFirst(self::lower($string));
}
107:
/**
 * Check if a string starts with a given substring
 *
 * Empty needles never match.
 *
 * @param iterable<string>|string $needles One or more substrings to check
 * for. `true` is returned if `$haystack` starts with any of them.
 * @param bool $ignoreCase If `true`, ASCII letters are compared without
 * regard to case.
 */
public static function startsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    if (!is_iterable($needles)) {
        $needles = [$needles];
    }
    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $needles = Arr::lower($needles);
    }
    foreach ($needles as $needle) {
        // Compare the leading bytes only, instead of searching the entire
        // haystack for the needle and checking where it was found
        if (
            $needle !== ''
            && strncmp($haystack, $needle, strlen($needle)) === 0
        ) {
            return true;
        }
    }
    return false;
}
129:
/**
 * Check if a string ends with a given substring
 *
 * Empty needles never match.
 *
 * @param iterable<string>|string $needles One or more substrings to check
 * for. `true` is returned if `$haystack` ends with any of them.
 * @param bool $ignoreCase If `true`, ASCII letters are compared without
 * regard to case.
 */
public static function endsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $suffixes = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $suffixes = Arr::lower($suffixes);
    }

    foreach ($suffixes as $suffix) {
        if ($suffix === '') {
            continue;
        }
        $tail = substr($haystack, -strlen($suffix));
        if ($tail === $suffix) {
            return true;
        }
    }

    return false;
}
151:
/**
 * Check if a string consists entirely of 7-bit ASCII bytes
 *
 * Returns `true` if no byte in the string appears in Str::ASCII_EXTENDED,
 * including when the string is empty.
 */
public static function isAscii(string $string): bool
{
    // Number of leading bytes that are not in Str::ASCII_EXTENDED
    $asciiPrefix = strcspn($string, self::ASCII_EXTENDED);

    return $asciiPrefix === strlen($string);
}
159:
/**
 * Escape special characters in a string for use in Markdown
 *
 * A backslash is inserted before each match of the following (the pattern
 * is applied in multiline, extended mode):
 *
 * - `*`, `<`, `[`, backslash, backtick and `|`, wherever they appear
 * - `_` at the start of a line, or after horizontal whitespace or
 *   punctuation
 * - `_` before horizontal whitespace, punctuation or the end of a line,
 *   optionally with other underscores in between
 * - the first `~` in a pair of tildes that is not part of a longer run
 * - at the start of a line, after optional horizontal whitespace: `>`, a
 *   `~` followed by two or more tildes, or, when followed by horizontal
 *   whitespace, one to six `#` characters, `+`, `-`, or the `.` after one
 *   or more digits
 */
public static function escapeMarkdown(string $string): string
{
    // The replacement is a literal backslash followed by the matched text.
    // NOTE(review): assumes Regex::replace() has preg_replace() semantics
    return Regex::replace(
        <<<'REGEX'
        / [*<[\\`|] |
        (?<= [\h[:punct:]] (?: (?<! _ ) | (?<= \G ) ) | ^ ) _ |
        _ (?= _*+ (?: [\h[:punct:]] | $ | \R ) ) |
        (?<! ~ ) ~ (?= ~ (?! ~ ) ) |
        ^ \h* \K (?: > | ~ (?= ~~+ ) | (?: \# {1,6} | [+-] | [0-9]+ \K \. ) (?= \h ) ) /mx
        REGEX,
        '\\\\$0',
        $string,
    );
}
177:
/**
 * Normalise a string for comparison
 *
 * The following steps are applied in order:
 *
 * 1. Replace "&" between alphanumeric characters with " and "
 * 2. Remove "."
 * 3. Replace non-alphanumeric character sequences with " "
 * 4. Remove leading and trailing whitespace
 * 5. Convert ASCII characters to uppercase
 *
 * The return value of this method is not covered by the Salient toolkit's
 * backward compatibility promise.
 */
public static function normalise(string $string): string
{
    // Steps 1 to 3, as pattern => replacement
    $rules = [
        '/([[:alnum:]][^&]*+)&(?=[^&[:alnum:]]*+[[:alnum:]])/u' => '$1 and ',
        '/\.++/' => '',
        '/[^[:alnum:]]++/u' => ' ',
    ];

    $replaced = Regex::replace(
        array_keys($rules),
        array_values($rules),
        $string,
    );

    // Steps 4 and 5
    return self::upper(trim($replaced));
}
201:
/**
 * Truncate a string to a maximum length, replacing the end of the string
 * with an ellipsis ("...") if it is too long
 *
 * Length is measured in characters, not bytes. Whitespace immediately
 * before the ellipsis is removed.
 *
 * @param int $length The maximum length of the return value, including the
 * ellipsis. Values less than `3` are treated as `3`.
 */
public static function ellipsize(string $value, int $length): string
{
    $limit = max($length, 3);

    if (mb_strlen($value) <= $limit) {
        return $value;
    }

    $head = mb_substr($value, 0, $limit - 3);

    return rtrim($head) . '...';
}
217:
/**
 * Replace every end-of-line sequence in a string with a given sequence
 *
 * "\r\n", "\r" and "\n" are all recognised as end-of-line sequences.
 *
 * @param string $eol The sequence to apply, e.g. "\n" (the default) or
 * "\r\n".
 */
public static function setEol(string $string, string $eol = "\n"): string
{
    // Reduce every sequence to "\n" first ("\r\n" must be replaced before
    // "\r" so it isn't converted to two newlines)
    $unix = str_replace(["\r\n", "\r"], "\n", $string);

    if ($eol === "\n") {
        return $unix;
    }

    return str_replace("\n", $eol, $unix);
}
237:
/**
 * Remove native end-of-line sequences from the end of a string
 *
 * Where PHP_EOL is "\n", a newline that belongs to a trailing "\r\n" is
 * left in place.
 */
public static function trimNativeEol(string $string): string
{
    if (\PHP_EOL !== "\n") {
        $eolLength = strlen(\PHP_EOL);
        while (substr($string, -$eolLength) === \PHP_EOL) {
            $string = substr($string, 0, -$eolLength);
        }
        return $string;
    }

    $trimmed = rtrim($string, "\n");

    // If trimming exposed a carriage return, the newline after it was part
    // of a "\r\n" sequence, so restore it
    $splitCrLf = $trimmed !== $string
        && $trimmed !== ''
        && $trimmed[-1] === "\r";

    return $splitCrLf
        ? "$trimmed\n"
        : $trimmed;
}
258:
/**
 * Replace newline characters ("\n") in a string with the native end-of-line
 * sequence (PHP_EOL)
 *
 * The string is returned unchanged where PHP_EOL is "\n".
 */
public static function eolToNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace("\n", \PHP_EOL, $string);
}
269:
/**
 * Replace native end-of-line sequences (PHP_EOL) in a string with the
 * newline character ("\n")
 *
 * The string is returned unchanged where PHP_EOL is "\n".
 */
public static function eolFromNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace(\PHP_EOL, "\n", $string);
}
280:
/**
 * Convert words in an arbitrarily capitalised string to snake_case,
 * optionally preserving non-word characters
 *
 * Words are found and delimited with "_" by {@see Str::words()}, then ASCII
 * letters in the result are converted to lowercase.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function snake(string $string, string $preserve = ''): string
{
    return self::lower(self::words($string, '_', $preserve));
}
289:
/**
 * Convert words in an arbitrarily capitalised string to kebab-case,
 * optionally preserving non-word characters
 *
 * Words are found and delimited with "-" by {@see Str::words()}, then ASCII
 * letters in the result are converted to lowercase.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function kebab(string $string, string $preserve = ''): string
{
    return self::lower(self::words($string, '-', $preserve));
}
298:
/**
 * Convert words in an arbitrarily capitalised string to camelCase,
 * optionally preserving non-word characters
 *
 * The string is converted to PascalCase by {@see Str::pascal()}, then every
 * letter that does not follow an alphanumeric character is converted to
 * lowercase.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function camel(string $string, string $preserve = ''): string
{
    $pascal = self::pascal($string, $preserve);

    return Regex::replaceCallback(
        '/(?<![[:alnum:]])[[:alpha:]]/u',
        static fn($match) => self::lower($match[0]),
        $pascal,
    );
}
311:
/**
 * Convert words in an arbitrarily capitalised string to PascalCase,
 * optionally preserving non-word characters
 *
 * Words are found by {@see Str::words()} and joined without a separator
 * after each is converted to lowercase with an uppercase first character.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function pascal(string $string, string $preserve = ''): string
{
    $capitalise = static fn($word) => self::upperFirst(self::lower($word));

    return self::words($string, '', $preserve, $capitalise);
}
320:
/**
 * Get words from an arbitrarily capitalised string and delimit them with a
 * separator, optionally preserving non-word characters and applying a
 * callback to each word
 *
 * A word consists of one or more letters of the same case, or one uppercase
 * letter followed by zero or more lowercase letters. Numbers are treated as
 * lowercase letters except that two or more uppercase letters form one word
 * with any subsequent numbers.
 *
 * @param string $separator Inserted between words. May be empty. Cannot
 * contain alphanumeric or preserved characters.
 * @param string $preserve Non-alphanumeric characters to preserve.
 * Alphanumeric characters in this value are ignored.
 * @param (Closure(string): string)|null $callback Applied to each word
 * after separators are inserted.
 * @throws InvalidArgumentException if `$separator` contains an alphanumeric
 * or preserved character.
 */
public static function words(
    string $string,
    string $separator = ' ',
    string $preserve = '',
    ?Closure $callback = null
): string {
    $notAfterPreserve = '';
    if ($preserve !== '') {
        // Alphanumeric characters are always kept, so remove them from the
        // list of characters to preserve
        $preserve = Regex::replace('/[[:alnum:]]++/u', '', $preserve);
        if ($preserve !== '') {
            $preserve = Regex::quoteCharacterClass($preserve, '/');
            // Prevent "key=value" becoming "key= value" when preserving "="
            // by asserting that when separating words, they must appear:
            // - immediately after the previous word (\G),
            // - after an unpreserved character, or
            // - at a word boundary (e.g. "Value" in "key=someValue")
            if ($separator !== '') {
                $notAfterPreserve = '(?:\G'
                    . "|(?<=[^[:alnum:]{$preserve}])"
                    . '|(?<=[[:lower:][:digit:]])(?=[[:upper:]]))';
            }
        }
    }
    // From here, $preserve is the content of a character class that matches
    // every character to keep, i.e. alphanumeric and preserved characters
    $preserve = "[:alnum:]{$preserve}";
    // Either an optional uppercase letter followed by lowercase letters and
    // digits, or a run of uppercase letters that are not followed by a
    // lowercase letter, with any digits after them
    $word = '(?:[[:upper:]]?[[:lower:][:digit:]]++'
        . '|(?:[[:upper:]](?![[:lower:]]))++[[:digit:]]*+)';

    // Insert separators before words to prevent "foo bar" becoming "foobar"
    if ($separator !== '') {
        if (Regex::match("/[{$preserve}]/u", $separator)) {
            throw new InvalidArgumentException('Invalid separator (preserved characters cannot be used)');
        }
        // From here, $separator is only used as a replacement string
        $separator = Regex::quoteReplacement($separator);
        $string = Regex::replace(
            "/$notAfterPreserve$word/u",
            $separator . '$0',
            $string,
        );
    }

    if ($callback !== null) {
        $string = Regex::replaceCallback(
            "/$word/u",
            fn($matches) => $callback($matches[0]),
            $string,
        );
    }

    // Trim unpreserved characters from the beginning and end of the string,
    // then replace sequences of them with one separator
    return Regex::replace([
        "/^[^{$preserve}]++|[^{$preserve}]++\$/Du",
        "/[^{$preserve}]++/u",
    ], [
        '',
        $separator,
    ], $string);
}
392:
/**
 * Expand tabs in a string to spaces
 *
 * Each tab is replaced with enough spaces to reach the next tab stop. Line
 * content is measured in characters, not bytes.
 *
 * @param int $tabSize The number of columns between tab stops.
 * @param int $column The starting column (1-based) of `$string`. Lines
 * after the first always start at column 1.
 */
public static function expandTabs(
    string $string,
    int $tabSize = 8,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }

    $eol = Get::eol($string) ?? "\n";
    $result = [];
    foreach (explode($eol, $string) as $line) {
        $cells = explode("\t", $line);
        // Text after the last tab is appended as-is
        $tail = array_pop($cells);
        $buffer = '';
        foreach ($cells as $cell) {
            $column += mb_strlen($cell);
            // e.g. with $tabSize 4, a tab at $column 2 occupies 3 spaces
            $padding = $tabSize - (($column - 1) % $tabSize);
            $buffer .= $cell . str_repeat(' ', $padding);
            $column += $padding;
        }
        $result[] = $buffer . $tail;
        $column = 1;
    }

    return implode($eol, $result);
}
427:
/**
 * Expand leading tabs in a string to spaces
 *
 * Only tabs at the start of a line are expanded. Tabs after any other
 * character are left in place.
 *
 * @param int $tabSize The number of columns between tab stops.
 * @param bool $preserveLine1 If `true`, tabs in the first line of `$string`
 * are not expanded.
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandLeadingTabs(
    string $string,
    int $tabSize = 8,
    bool $preserveLine1 = false,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }
    $eol = Get::eol($string) ?? "\n";
    $softTab = str_repeat(' ', $tabSize);
    $expanded = '';
    foreach (explode($eol, $string) as $i => $line) {
        // Restore the line break removed by explode()
        !$i || $expanded .= $eol;
        // Lines that start at column 1 (every line after the first, and
        // the first line if $column is 1) can have each leading tab
        // replaced with a full-width soft tab
        if ($i || (!$preserveLine1 && $column === 1)) {
            $expanded .= Regex::replace('/(?<=\n|\G)\t/', $softTab, $line);
            continue;
        }
        if ($preserveLine1) {
            $expanded .= $line;
            continue;
        }
        // Otherwise, this is the first line and it starts at a column
        // other than 1, so the width of each leading tab depends on its
        // position
        $parts = explode("\t", $line);
        while (($part = array_shift($parts)) !== null) {
            $expanded .= $part;
            if (!$parts) {
                break;
            }
            // Stop at the first tab that follows other text, and restore
            // the remaining tabs as-is
            if ($part !== '') {
                $expanded .= "\t" . implode("\t", $parts);
                break;
            }
            $column += mb_strlen($part);
            $spaces = $tabSize - (($column - 1) % $tabSize);
            $expanded .= str_repeat(' ', $spaces);
            $column += $spaces;
        }
    }
    return $expanded;
}
475:
/**
 * Copy a string to a temporary stream
 *
 * The string is written to a "php://temp" stream opened for reading and
 * writing, and the stream is rewound before it is returned.
 *
 * @return resource
 */
public static function toStream(string $string)
{
    $handle = File::open('php://temp', 'r+');

    File::writeAll($handle, $string);
    File::rewind($handle);

    return $handle;
}
488:
/**
 * Split a string by a string, trim substrings and remove any empty strings
 *
 * @param non-empty-string $separator
 * @param int|null $limit The maximum number of substrings to return. If
 * given, empty strings are not removed.
 * @param bool $removeEmpty If `false`, empty strings are not removed.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @return ($removeEmpty is true ? list<string> : non-empty-list<string>)
 */
public static function split(
    string $separator,
    string $string,
    ?int $limit = null,
    bool $removeEmpty = true,
    ?string $characters = null
): array {
    // Empty strings are only removed if no limit is given
    $keepEmpty = $limit !== null || !$removeEmpty;

    $parts = explode($separator, $string, $limit ?? \PHP_INT_MAX);
    $parts = Arr::trim($parts, $characters, !$keepEmpty);

    return $keepEmpty
        ? array_values($parts)
        : $parts;
}
516:
/**
 * Split a string by a string without splitting bracket-delimited or
 * double-quoted substrings, trim substrings and remove any empty strings
 *
 * Substrings enclosed in `()`, `<>`, `[]` or `{}` are never split, and
 * brackets may be nested. Quoted substrings are preserved according to
 * `$flags`, and a backslash in a quoted substring escapes the character
 * after it.
 *
 * @param non-empty-string $separator Must be exactly one byte long, and
 * cannot be a bracket or a quote character preserved via `$flags`.
 * @param bool $removeEmpty If `false`, empty strings are not removed.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @param int-mask-of<Str::PRESERVE_*> $flags
 * @return ($removeEmpty is true ? list<string> : non-empty-list<string>)
 * @throws InvalidArgumentException if `$separator` is not a single byte or
 * is one of the delimiters.
 */
public static function splitDelimited(
    string $separator,
    string $string,
    bool $removeEmpty = true,
    ?string $characters = null,
    int $flags = Str::PRESERVE_DOUBLE_QUOTED
): array {
    if (strlen($separator) !== 1) {
        throw new InvalidArgumentException('Separator must be a single character');
    }

    // Collect the quote characters to preserve, and a regex alternative
    // that matches each kind of quoted substring
    $quotes = '';
    $regex = '';
    if ($flags & self::PRESERVE_DOUBLE_QUOTED) {
        $quotes .= '"';
        $regex .= ' | " (?: [^"\\\\] | \\\\ . )*+ "';
    }
    if ($flags & self::PRESERVE_SINGLE_QUOTED) {
        $quotes .= "'";
        $regex .= " | ' (?: [^'\\\\] | \\\\ . )*+ '";
    }

    if (strpos('()<>[]{}' . $quotes, $separator) !== false) {
        throw new InvalidArgumentException('Separator cannot be a delimiter');
    }

    // $quoted is the separator escaped for use in a pattern, and $escaped
    // is the separator escaped for use in a character class
    $quoted = preg_quote($separator, '/');
    $escaped = Regex::quoteCharacterClass($separator, '/');

    // Each match is one substring: a run of text without delimiters or
    // separators, bracketed groups (matched recursively via "(?-1)") and
    // quoted substrings, or an empty string between separators
    $regex = <<<REGEX
        (?x)
        (?: [^{$quotes}()<>[\]{}{$escaped}]++ |
        ( \( (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \) |
        < (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ > |
        \[ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \] |
        \{ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \}{$regex} ) |
        # Match empty substrings
        (?<= $quoted | ^ ) (?= $quoted | \$ ) )+
        REGEX;

    Regex::matchAll(
        Regex::delimit($regex, '/'),
        $string,
        $matches,
    );

    $split = Arr::trim(
        $matches[0],
        $characters,
        $removeEmpty
    );

    return $removeEmpty ? $split : array_values($split);
}
581:
/**
 * Wrap a string to a given number of characters, optionally varying the
 * width of the first line
 *
 * Wrapping is performed by `wordwrap()`, so widths are measured in bytes.
 *
 * @param int|array{int,int} $width The number of characters at which the
 * string will be wrapped, or `[ <first_line_width>, <width> ]`.
 * @param string $break Inserted where the string is wrapped.
 * @param bool $cutLongWords If `true`, words longer than the width are
 * split.
 */
public static function wrap(
    string $string,
    $width = 75,
    string $break = "\n",
    bool $cutLongWords = false
): string {
    [$delta, $width] = is_array($width)
        ? [$width[1] - $width[0], $width[1]]
        : [0, $width];

    if (!$delta) {
        return wordwrap($string, $width, $break, $cutLongWords);
    }

    // For hanging indents, remove and restore the first $delta characters
    if ($delta < 0) {
        $offset = -$delta;
        // If there is nothing after the characters to remove, there is
        // nothing to wrap. This also keeps `false`, which substr() returns
        // before PHP 8.0 if the offset is beyond the end of the string,
        // from being passed to wordwrap()
        if (strlen($string) <= $offset) {
            return $string;
        }
        return substr($string, 0, $offset)
            . wordwrap(substr($string, $offset), $width, $break, $cutLongWords);
    }

    // For first line indents, add and remove $delta characters
    return substr(
        wordwrap(str_repeat('x', $delta) . $string, $width, $break, $cutLongWords),
        $delta,
    );
}
615:
/**
 * Undo wordwrap(), preserving Markdown-style paragraphs and lists
 *
 * Non-consecutive line breaks are converted to spaces except before:
 *
 * - four or more spaces
 * - one or more tabs
 * - Markdown-style list items (e.g. `- item`, `1. item`)
 *
 * @param string $break The line break to remove.
 * @param bool $ignoreEscapes If `false`, preserve escaped whitespace.
 * @param bool $trimLines If `true`, remove whitespace from the end of each
 * line and between unwrapped lines.
 * @param bool $collapseBlankLines If `true`, collapse three or more
 * subsequent line breaks to two.
 */
public static function unwrap(
    string $string,
    string $break = "\n",
    bool $ignoreEscapes = true,
    bool $trimLines = false,
    bool $collapseBlankLines = false
): string {
    $newline = preg_quote($break, '/');
    // If escapes are not ignored, only match after an even number of
    // backslashes (including zero)
    $noEscape = $ignoreEscapes ? '' : '(?<!\\\\)(?:\\\\\\\\)*\K';

    $search = [];
    $replace = [];
    $between = '';

    if ($trimLines) {
        $search[] = "/{$noEscape}\h+({$newline})/";
        $replace[] = '$1';
        $between = '\h*';
    }

    // Match line breaks that are not at the start or end of the string and
    // are not adjacent to another line break, unless they are followed by
    // four spaces, a tab, or a list item. " {4}" is used instead of four
    // literal spaces so the pattern matches the documented behaviour even
    // if whitespace in this file is reformatted.
    $search[] = "/{$noEscape}(?<!{$newline}|^){$newline}(?!{$newline}|\$| {4}|\\t|(?:[-+*]|[0-9]+[).])\h){$between}/D";
    $replace[] = ' ';

    if ($collapseBlankLines) {
        $search[] = "/(?:{$newline}){3,}/";
        $replace[] = $break . $break;
    }

    return Regex::replace($search, $replace, $string);
}
659:
/**
 * Replace whitespace character sequences in a string with a single space
 *
 * Leading and trailing whitespace is also replaced with a single space, not
 * removed.
 */
public static function collapse(string $string): string
{
    return Regex::replace('/\s++/', ' ', $string);
}
667:
/**
 * Add delimiters to the beginning and end of a string
 *
 * @param string $before Added before the string.
 * @param string|null $after Added after the string. If `null`, `$before`
 * is used before and after the string.
 */
public static function enclose(string $string, string $before, ?string $after = null): string
{
    if ($after === null) {
        $after = $before;
    }

    return "{$before}{$string}{$after}";
}
678:
/**
 * Get the Levenshtein distance between two strings as a proportion of the
 * length (in bytes) of the longest string
 *
 * @param bool $normalise If `true`, call {@see Str::normalise()} to
 * normalise `$string1` and `$string2` for comparison.
 * @return float A value between `0` and `1`, where `0` means the strings
 * are identical, and `1` means they have no similarities.
 */
public static function distance(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        [$string1, $string2] = [
            self::normalise($string1),
            self::normalise($string2),
        ];
    }

    $longest = max(strlen($string1), strlen($string2));

    // Two empty strings are identical
    if ($longest === 0) {
        return 0.0;
    }

    return levenshtein($string1, $string2) / $longest;
}
705:
/**
 * Get the similarity of two strings as a proportion of the length (in
 * bytes) of the longest string
 *
 * `similar_text()` is called with the strings in both orders and the
 * higher result is used.
 *
 * @param bool $normalise If `true`, call {@see Str::normalise()} to
 * normalise `$string1` and `$string2` for comparison.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no similarities, and `1` means they are identical.
 */
public static function similarity(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        [$string1, $string2] = [
            self::normalise($string1),
            self::normalise($string2),
        ];
    }

    $longest = max(strlen($string1), strlen($string2));

    // Two empty strings are identical
    if ($longest === 0) {
        return 1.0;
    }

    $forward = similar_text($string1, $string2);
    $reverse = similar_text($string2, $string1);

    return max($forward, $reverse) / $longest;
}
737:
/**
 * Get ngrams shared between two strings relative to the number of
 * ngrams in the longest string
 *
 * @param bool $normalise If `true`, call {@see Str::normalise()} to
 * normalise `$string1` and `$string2` for comparison.
 * @param int $size The length of each ngram, in bytes.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramSimilarity(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    // `true` selects scoring relative to the string with the most ngrams
    return self::ngramScore(true, $string1, $string2, $normalise, $size);
}
755:
/**
 * Get ngrams shared between two strings relative to the number of
 * ngrams in the shortest string
 *
 * @param bool $normalise If `true`, call {@see Str::normalise()} to
 * normalise `$string1` and `$string2` for comparison.
 * @param int $size The length of each ngram, in bytes.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramIntersection(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    // `false` selects scoring relative to the string with the fewest ngrams
    return self::ngramScore(false, $string1, $string2, $normalise, $size);
}
773:
/**
 * Get the number of ngrams shared between two strings as a proportion of
 * the number of ngrams in the longest or shortest string
 *
 * Each ngram in `$string2` is matched with at most one ngram in `$string1`,
 * so repeated ngrams are only counted as many times as they appear in both
 * strings.
 *
 * @param bool $relativeToLongest If `true`, the score is relative to the
 * string with the most ngrams, otherwise it is relative to the string with
 * the fewest.
 * @param bool $normalise If `true`, call {@see Str::normalise()} to
 * normalise `$string1` and `$string2` for comparison.
 * @param int $size The length of each ngram, in bytes.
 * @return float `1.0` if both strings are shorter than `$size`, `0.0` if
 * there are no ngrams to compare, otherwise a value between `0` and `1`.
 */
private static function ngramScore(
    bool $relativeToLongest,
    string $string1,
    string $string2,
    bool $normalise,
    int $size
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    if (strlen($string1) < $size && strlen($string2) < $size) {
        return 1.0;
    }

    $ngrams1 = self::ngrams($string1, $size);
    $ngrams2 = self::ngrams($string2, $size);
    $count = $relativeToLongest
        ? max(count($ngrams1), count($ngrams2))
        : min(count($ngrams1), count($ngrams2));

    // If only one string is long enough to have ngrams and the score is
    // relative to the shortest string, there is nothing to compare, so
    // return early instead of dividing by zero
    if ($count === 0) {
        return 0.0;
    }

    // Track how many of each ngram in $string2 have not been matched yet
    $available = array_count_values($ngrams2);
    $same = 0;
    foreach ($ngrams1 as $ngram) {
        if (($available[$ngram] ?? 0) > 0) {
            $same++;
            $available[$ngram]--;
        }
    }

    return $same / $count;
}
807:
/**
 * Get a string's n-grams
 *
 * Returns every substring of `$size` bytes, ordered by starting offset
 * modulo `$size`, e.g. the bigrams of "abcde" are returned as "ab", "cd",
 * "bc", "de".
 *
 * @param int $size The length of each n-gram, in bytes.
 * @return string[] An empty array if the string is shorter than `$size`.
 */
public static function ngrams(string $string, int $size = 2): array
{
    $length = strlen($string);
    if ($length < $size) {
        return [];
    }

    $ngrams = [];
    for ($offset = 0; $offset < $size; $offset++) {
        // Collect consecutive, non-overlapping n-grams from this offset
        for ($start = $offset; $start + $size <= $length; $start += $size) {
            $ngrams[] = substr($string, $start, $size);
        }
    }

    return $ngrams;
}
836:
/**
 * Group lists in a string by heading and remove duplicate items
 *
 * - Lines in `$string` are processed in order, from first to last
 * - If a non-empty line matches `$itemRegex`, it is treated as a list item,
 *   otherwise it becomes the current heading
 * - The current heading is cleared when an empty line is encountered after
 *   a list item (unless `$loose` is `true`)
 * - Top-level lines (i.e. headings with no items, and items with no
 *   heading) are returned before lists with headings
 * - If `$itemRegex` has a named subpattern called `indent` that matches a
 *   non-empty string, subsequent lines with indentation of the same width
 *   are treated as a continuation of the item, along with any empty lines
 *   between them
 *
 * @param string $string May use "\r\n", "\n" or "\r" line endings.
 * @param string $listSeparator Inserted between headings and lists.
 * @param string|null $headingPrefix Inserted before headings, e.g. `"-"`.
 * Indentation of the same width is applied to subsequent list items. An
 * empty string is equivalent to `null`.
 * @param string|null $itemRegex If `null`, {@see Str::DEFAULT_ITEM_REGEX}
 * is used.
 * @param bool $clean If `true`, remove the first match of `$itemRegex` from
 * the beginning of each item with no heading.
 * @param bool $loose If `true`, do not clear the current heading when an
 * empty line is encountered.
 * @param bool $discardEmpty If `true`, discard headings with no items.
 * @param string $eol Inserted between the lines of multi-line items and
 * between items in the same list.
 * @param int $tabSize The tab size used to expand tabs when measuring
 * indentation.
 */
public static function mergeLists(
    string $string,
    string $listSeparator = "\n",
    ?string $headingPrefix = null,
    ?string $itemRegex = Str::DEFAULT_ITEM_REGEX,
    bool $clean = false,
    bool $loose = false,
    bool $discardEmpty = false,
    string $eol = "\n",
    int $tabSize = 4
): string {
    // Treat an empty prefix as no prefix
    $prefix = self::coalesce($headingPrefix, null);
    $regex = $itemRegex ?? self::DEFAULT_ITEM_REGEX;

    if ($prefix !== null) {
        $prefixIsItem = (bool) Regex::match($regex, $prefix);
        $prefixBytes = strlen($prefix);
        $indent = str_repeat(' ', mb_strlen($prefix));
    } else {
        $indent = '';
    }

    $lines = Regex::split('/\r\n|\n|\r/', $string);
    $count = count($lines);
    // Maps each heading or top-level line to a list of its items
    $lists = [];
    $lastWasItem = false;
    for ($i = 0; $i < $count; $i++) {
        $line = $lines[$i];

        // Remove prefixes to ensure lists with the same heading are merged
        if (
            $prefix !== null
            && !$prefixIsItem
            && substr($line, 0, $prefixBytes) === $prefix
        ) {
            /** @var string */
            $line = substr($line, $prefixBytes);
        }

        // Clear the current heading if this is an empty line after an item
        if (trim($line) === '') {
            if (!$loose && $lastWasItem) {
                unset($list);
            }
            continue;
        }

        if (Regex::match($regex, $line, $matches, \PREG_OFFSET_CAPTURE)) {
            // Collect subsequent lines with indentation of the same width
            if (
                ($matches['indent'][1] ?? null) === 0
                && ($itemIndent = $matches['indent'][0]) !== ''
            ) {
                $itemIndent = self::expandTabs($itemIndent, $tabSize);
                $itemIndentBytes = mb_strlen($itemIndent);
                $itemIndent = str_repeat(' ', $itemIndentBytes);
                // Empty lines are only added to the item if an indented
                // line follows them, otherwise they are processed again by
                // the outer loop
                $tentative = '';
                $backtrack = 0;
                while ($i < $count - 1) {
                    $nextLine = $lines[$i + 1];
                    if (trim($nextLine) === '') {
                        $tentative .= $nextLine . $eol;
                        $backtrack++;
                    } elseif (substr(self::expandTabs($nextLine, $tabSize), 0, $itemIndentBytes) === $itemIndent) {
                        $line .= $eol . $tentative . $nextLine;
                        $tentative = '';
                        $backtrack = 0;
                    } else {
                        $i -= $backtrack;
                        break;
                    }
                    $i++;
                }
            }
        } else {
            $list = $line;
        }

        // If there is no current heading, the line is its own key, and is
        // added to $lists with no items
        $key = $list ?? $line;
        $lists[$key] ??= [];
        $lastWasItem = $key !== $line;
        if ($lastWasItem && !in_array($line, $lists[$key], true)) {
            $lists[$key][] = $line;
        }
    }

    // Move top-level lines to the top
    $top = [];
    $itemList = null;
    foreach ($lists as $list => $lines) {
        if (count($lines)) {
            continue;
        }

        unset($lists[$list]);

        if ($discardEmpty && !Regex::match($regex, $list)) {
            continue;
        }

        if ($clean) {
            $top[$list] = [];
            continue;
        }

        // Move consecutive top-level items to their own list so
        // `$listSeparator` isn't inserted between them
        if (Regex::match($regex, $list)) {
            if ($itemList !== null) {
                $top[$itemList][] = $list;
                continue;
            }
            $itemList = $list;
        } else {
            $itemList = null;
        }
        $top[$list] = [];
    }
    // Keys in $top were removed from $lists above, so nothing is
    // overwritten here
    $lists = $top + $lists;

    $merged = [];
    foreach ($lists as $list => $lines) {
        if ($clean) {
            $list = Regex::replace($regex, '', $list, 1);
        }

        // Add the prefix to headings, but not to lines that already start
        // with it (when the prefix is itself an item) or that are items
        if (
            $prefix !== null
            && !($prefixIsItem && substr($list, 0, $prefixBytes) === $prefix)
            && !Regex::match($regex, $list)
        ) {
            $list = $prefix . $list;
            $listHasPrefix = true;
        } else {
            $listHasPrefix = false;
        }

        if (!$lines) {
            $merged[] = $list;
            continue;
        }

        // Don't separate or indent consecutive top-level items
        if (!$listHasPrefix && Regex::match($regex, $list)) {
            $merged[] = implode($eol, [$list, ...$lines]);
            continue;
        }

        $merged[] = $list;
        $merged[] = $indent . implode($eol . $indent, $lines);
    }

    return implode($listSeparator, $merged);
}
1015: }
1016: