1: <?php declare(strict_types=1);
2:
3: namespace Salient\Utility;
4:
5: use Closure;
6: use InvalidArgumentException;
7:
8: /**
9: * Work with strings
10: *
11: * @api
12: */
13: final class Str extends AbstractUtility
14: {
/** ASCII letters: lowercase followed by uppercase */
public const ALPHA = self::LOWER . self::UPPER;

/** ASCII letters followed by decimal digits */
public const ALPHANUMERIC = self::ALPHA . self::NUMERIC;

/** Hexadecimal digits in both cases */
public const HEX = '0123456789abcdefABCDEF';

/** Lowercase ASCII letters */
public const LOWER = 'abcdefghijklmnopqrstuvwxyz';

/** Decimal digits */
public const NUMERIC = '0123456789';

/** Uppercase ASCII letters */
public const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';

/** Flag for splitDelimited(): do not split double-quoted substrings */
public const PRESERVE_DOUBLE_QUOTED = 1;

/** Flag for splitDelimited(): do not split single-quoted substrings */
public const PRESERVE_SINGLE_QUOTED = 2;

/** Flag for splitDelimited(): do not split substrings in either kind of quote */
public const PRESERVE_QUOTED = self::PRESERVE_DOUBLE_QUOTED | self::PRESERVE_SINGLE_QUOTED;

/** Every byte from 0x80 to 0xFF, i.e. all non-ASCII byte values */
public const ASCII_EXTENDED = "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
25:
/**
 * Return the first argument that is neither null nor an empty string
 *
 * If every argument is null or empty, the final argument is returned as-is,
 * and `null` is returned when no arguments are given.
 */
public static function coalesce(?string ...$strings): ?string
{
    $last = null;
    foreach ($strings as $candidate) {
        if ($candidate !== null && $candidate !== '') {
            return $candidate;
        }
        // Remember the most recent value so it can be returned as a fallback
        $last = $candidate;
    }
    return $last;
}
40:
/**
 * Replace uppercase ASCII letters in a string with their lowercase
 * equivalents
 *
 * Bytes other than "A" to "Z" are left untouched.
 */
public static function lower(string $string): string
{
    $from = self::UPPER;
    $to = self::LOWER;

    return strtr($string, $from, $to);
}
48:
/**
 * Replace lowercase ASCII letters in a string with their uppercase
 * equivalents
 *
 * Bytes other than "a" to "z" are left untouched.
 */
public static function upper(string $string): string
{
    $from = self::LOWER;
    $to = self::UPPER;

    return strtr($string, $from, $to);
}
56:
/**
 * Convert the first byte of a string to uppercase if it is a lowercase
 * ASCII letter
 *
 * An empty string is returned unchanged.
 */
public static function upperFirst(string $string): string
{
    return $string === ''
        ? ''
        : self::upper($string[0]) . substr($string, 1);
}
68:
/**
 * Apply the ASCII letter case of one string to another
 *
 * After trimming `$match`:
 *
 * - if it is empty or has no ASCII letters, `$string` is returned as-is
 * - if it has lowercase letters only, `$string` is converted to lowercase
 * - if it has uppercase letters only and is longer than one byte, `$string`
 *   is converted to uppercase
 * - otherwise, if it starts with an uppercase letter, `$string` is
 *   converted to lowercase with an uppercase first letter
 * - in any other case, `$string` is returned as-is
 */
public static function matchCase(string $string, string $match): string
{
    $match = trim($match);
    if ($match === '') {
        return $string;
    }

    // The part of $match from its first uppercase letter, or false
    $fromFirstUpper = strpbrk($match, self::UPPER);
    $hasLower = strpbrk($match, self::LOWER) !== false;

    if ($fromFirstUpper === false) {
        return $hasLower
            ? self::lower($string)
            : $string;
    }

    if (!$hasLower && strlen($match) > 1) {
        return self::upper($string);
    }

    // $match is mixed case or a single uppercase letter, so only capitalise
    // $string if $match itself starts with an uppercase letter
    return $fromFirstUpper === $match
        ? self::upperFirst(self::lower($string))
        : $string;
}
102:
/**
 * Check if a string begins with at least one of the given substrings
 *
 * Empty needles never match. If `$ignoreCase` is `true`, ASCII letters in
 * the haystack and needles are converted to lowercase before comparison.
 *
 * @param iterable<string>|string $needles
 */
public static function startsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $candidates = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $candidates = Arr::lower($candidates);
    }

    foreach ($candidates as $needle) {
        if ($needle === '') {
            continue;
        }
        if (strpos($haystack, $needle) === 0) {
            return true;
        }
    }

    return false;
}
124:
/**
 * Check if a string ends with at least one of the given substrings
 *
 * Empty needles never match. If `$ignoreCase` is `true`, ASCII letters in
 * the haystack and needles are converted to lowercase before comparison.
 *
 * @param iterable<string>|string $needles
 */
public static function endsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $candidates = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $candidates = Arr::lower($candidates);
    }

    foreach ($candidates as $needle) {
        if ($needle === '') {
            continue;
        }
        // Compare the needle with the same number of bytes at the end of
        // the haystack
        if (substr($haystack, -strlen($needle)) === $needle) {
            return true;
        }
    }

    return false;
}
146:
/**
 * Check if a string consists entirely of bytes between 0 and 127
 *
 * An empty string is regarded as ASCII.
 */
public static function isAscii(string $string): bool
{
    // Length of the leading run of bytes that are not in 0x80-0xFF
    $asciiPrefix = strcspn($string, self::ASCII_EXTENDED);

    return $asciiPrefix === strlen($string);
}
154:
/**
 * Reduce a string to a canonical form for comparison
 *
 * The following steps are applied in order:
 *
 * 1. "&" is replaced with " and " where alphanumeric text appears on both
 *    sides of it
 * 2. "." is removed
 * 3. Sequences of non-alphanumeric characters are replaced with " "
 * 4. Leading and trailing whitespace is removed
 * 5. ASCII letters are converted to uppercase
 *
 * The return value of this method is not covered by the Salient toolkit's
 * backward compatibility promise.
 */
public static function normalise(string $string): string
{
    $patterns = [
        '/([[:alnum:]][^&]*+)&(?=[^&[:alnum:]]*+[[:alnum:]])/u',
        '/\.++/',
        '/[^[:alnum:]]++/u',
    ];
    $replacements = [
        '$1 and ',
        '',
        ' ',
    ];

    $replaced = Regex::replace($patterns, $replacements, $string);

    return self::upper(trim($replaced));
}
178:
/**
 * Truncate a string and append "..." if it has more than a given number of
 * characters
 *
 * Lengths below 3 are treated as 3. Whitespace left at the end of the
 * truncated text is removed before the ellipsis is added, so the result
 * never has more than `$length` characters.
 */
public static function ellipsize(string $value, int $length): string
{
    $limit = max(3, $length);

    if (mb_strlen($value) <= $limit) {
        return $value;
    }

    // Leave room for the three-character ellipsis
    $kept = mb_substr($value, 0, $limit - 3);

    return rtrim($kept) . '...';
}
194:
/**
 * Replace every line ending in a string with a given end-of-line sequence
 *
 * CRLF, CR and LF line endings are all recognised, and `$eol` may be any
 * string.
 */
public static function setEol(string $string, string $eol = "\n"): string
{
    if ($eol === "\r") {
        return str_replace(["\r\n", "\n"], "\r", $string);
    }

    // Convert CRLF and CR to LF, then expand LF to the requested sequence
    // if it is something else
    $unix = str_replace(["\r\n", "\r"], "\n", $string);
    if ($eol === "\n") {
        return $unix;
    }

    return str_replace("\n", $eol, $unix);
}
214:
/**
 * Strip trailing occurrences of the platform's end-of-line sequence
 * (`PHP_EOL`) from a string
 */
public static function trimNativeEol(string $string): string
{
    if (\PHP_EOL !== "\n") {
        $eolLength = strlen(\PHP_EOL);
        while (substr($string, -$eolLength) === \PHP_EOL) {
            $string = substr($string, 0, -$eolLength);
        }
        return $string;
    }

    $trimmed = rtrim($string, "\n");

    // If removing newlines exposed a trailing "\r", one "\n" is restored so
    // the string still ends with the CRLF it had before
    $exposedCr = $trimmed !== $string
        && $trimmed !== ''
        && $trimmed[-1] === "\r";

    return $exposedCr
        ? $trimmed . "\n"
        : $trimmed;
}
235:
/**
 * Convert LF (`"\n"`) in a string to the platform's end-of-line sequence
 *
 * The string is returned unchanged when `PHP_EOL` is already `"\n"`.
 */
public static function eolToNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace("\n", \PHP_EOL, $string);
}
246:
/**
 * Convert the platform's end-of-line sequence in a string to LF (`"\n"`)
 *
 * The string is returned unchanged when `PHP_EOL` is already `"\n"`.
 */
public static function eolFromNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace(\PHP_EOL, "\n", $string);
}
257:
/**
 * Get snake_case from the words in an arbitrarily capitalised string
 *
 * Words are found by {@see Str::words()}, joined with "_" and converted to
 * lowercase.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function snake(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '_', $preserve);

    return self::lower($delimited);
}
266:
/**
 * Get kebab-case from the words in an arbitrarily capitalised string
 *
 * Words are found by {@see Str::words()}, joined with "-" and converted to
 * lowercase.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function kebab(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '-', $preserve);

    return self::lower($delimited);
}
275:
/**
 * Get camelCase from the words in an arbitrarily capitalised string
 *
 * The string is converted to PascalCase by {@see Str::pascal()}, then every
 * letter that is not immediately preceded by an alphanumeric character is
 * converted to lowercase.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function camel(string $string, string $preserve = ''): string
{
    $pascal = self::pascal($string, $preserve);

    return Regex::replaceCallback(
        '/(?<![[:alnum:]])[[:alpha:]]/u',
        fn($letter) => self::lower($letter[0]),
        $pascal,
    );
}
288:
/**
 * Get PascalCase from the words in an arbitrarily capitalised string
 *
 * Words are found by {@see Str::words()}, converted to lowercase with an
 * uppercase first letter, and joined without a separator.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 */
public static function pascal(string $string, string $preserve = ''): string
{
    $capitalise = fn($word) => self::upperFirst(self::lower($word));

    return self::words($string, '', $preserve, $capitalise);
}
297:
/**
 * Extract words from an arbitrarily capitalised string and join them with a
 * separator, optionally keeping some non-word characters and passing each
 * word through a callback
 *
 * A word consists of one or more letters of the same case, or one uppercase
 * letter followed by zero or more lowercase letters. Numbers are treated as
 * lowercase letters except that two or more uppercase letters form one word
 * with any subsequent numbers.
 *
 * @param string $preserve Non-alphanumeric characters to preserve.
 * Alphanumeric characters in this value are ignored.
 * @param (Closure(string): string)|null $callback
 * @throws InvalidArgumentException if `$separator` contains an alphanumeric
 * or preserved character.
 */
public static function words(
    string $string,
    string $separator = ' ',
    string $preserve = '',
    ?Closure $callback = null
): string {
    // Only non-alphanumeric characters can be preserved
    $kept = $preserve === ''
        ? ''
        : Regex::replace('/[[:alnum:]]++/u', '', $preserve);

    $anchor = '';
    if ($kept !== '') {
        $kept = Regex::quoteCharacterClass($kept, '/');
        if ($separator !== '') {
            // Prevent "key=value" becoming "key= value" when preserving "="
            // by only inserting a separator before a word that appears:
            // - immediately after the previous word (\G),
            // - after an unpreserved character, or
            // - at a word boundary (e.g. "Value" in "key=someValue")
            $anchor = '(?:\G'
                . "|(?<=[^[:alnum:]{$kept}])"
                . '|(?<=[[:lower:][:digit:]])(?=[[:upper:]]))';
        }
    }

    // Content of the character class that matches characters to keep
    $keep = "[:alnum:]{$kept}";
    $wordPattern = '(?:[[:upper:]]?[[:lower:][:digit:]]++'
        . '|(?:[[:upper:]](?![[:lower:]]))++[[:digit:]]*+)';

    $replacement = $separator;
    if ($separator !== '') {
        if (Regex::match("/[{$keep}]/u", $separator)) {
            throw new InvalidArgumentException('Invalid separator (preserved characters cannot be used)');
        }
        $replacement = Regex::quoteReplacement($separator);
        // Insert a separator before each word so that "fooBar" does not
        // become "foobar" when adjacent words are only distinguished by case
        $string = Regex::replace(
            "/{$anchor}{$wordPattern}/u",
            $replacement . '$0',
            $string,
        );
    }

    if ($callback !== null) {
        $string = Regex::replaceCallback(
            "/{$wordPattern}/u",
            fn($word) => $callback($word[0]),
            $string,
        );
    }

    // Remove unpreserved characters from both ends of the string, then
    // collapse each remaining run of them to one separator
    $patterns = [
        "/^[^{$keep}]++|[^{$keep}]++\$/Du",
        "/[^{$keep}]++/u",
    ];

    return Regex::replace($patterns, ['', $replacement], $string);
}
369:
/**
 * Replace tabs in a string with spaces, aligning text to tab stops
 *
 * Lines are separated using the value returned by `Get::eol()` for `$text`,
 * or LF (`"\n"`) if it returns `null`.
 *
 * @param int $tabSize The distance between tab stops.
 * @param int $column The starting column (1-based) of `$text`. Only
 * applies to the first line; subsequent lines start at column 1.
 */
public static function expandTabs(
    string $text,
    int $tabSize = 8,
    int $column = 1
): string {
    if (strpos($text, "\t") === false) {
        return $text;
    }

    $eol = Get::eol($text) ?? "\n";
    $result = '';
    foreach (explode($eol, $text) as $index => $line) {
        if ($index) {
            $result .= $eol;
        }

        // Every segment except the last is followed by a tab
        $segments = explode("\t", $line);
        $tail = array_pop($segments);
        foreach ($segments as $segment) {
            $column += mb_strlen($segment);
            // e.g. with $tabSize 4, a tab at $column 2 occupies 3 spaces
            $padding = $tabSize - (($column - 1) % $tabSize);
            $result .= $segment . str_repeat(' ', $padding);
            $column += $padding;
        }
        $result .= $tail;

        $column = 1;
    }

    return $result;
}
404:
/**
 * Replace tabs at the start of each line in a string with spaces
 *
 * Tabs that appear after other text in a line are not expanded. Lines are
 * separated using the value returned by `Get::eol()` for `$text`, or LF
 * (`"\n"`) if it returns `null`.
 *
 * @param int $tabSize The distance between tab stops.
 * @param bool $preserveLine1 If `true`, tabs in the first line of `$text`
 * are not expanded.
 * @param int $column The starting column (1-based) of `$text`.
 */
public static function expandLeadingTabs(
    string $text,
    int $tabSize = 8,
    bool $preserveLine1 = false,
    int $column = 1
): string {
    if (strpos($text, "\t") === false) {
        return $text;
    }

    $eol = Get::eol($text) ?? "\n";
    $softTab = str_repeat(' ', $tabSize);
    $result = '';
    foreach (explode($eol, $text) as $index => $line) {
        if ($index) {
            $result .= $eol;
        }

        // Lines that start at column 1 can have each leading tab replaced
        // with a full-width soft tab
        if ($index || (!$preserveLine1 && $column === 1)) {
            $result .= Regex::replace('/(?<=\n|\G)\t/', $softTab, $line);
            continue;
        }

        if ($preserveLine1) {
            $result .= $line;
            continue;
        }

        // The first line starts mid-row, so the width of each leading tab
        // depends on the current column
        $segments = explode("\t", $line);
        while ($segments) {
            $segment = array_shift($segments);
            $result .= $segment;
            if (!$segments) {
                break;
            }
            if ($segment !== '') {
                // Text has been reached, so restore any remaining tabs
                $result .= "\t" . implode("\t", $segments);
                break;
            }
            $padding = $tabSize - (($column - 1) % $tabSize);
            $result .= str_repeat(' ', $padding);
            $column += $padding;
        }
    }

    return $result;
}
452:
/**
 * Get a temporary stream that contains a copy of a string
 *
 * The string is written to a `php://temp` stream opened in `r+` mode, and
 * the stream is rewound before it is returned.
 *
 * @return resource
 */
public static function toStream(string $string)
{
    $temp = File::open('php://temp', 'r+');

    File::write($temp, $string);
    File::rewind($temp);

    return $temp;
}
465:
/**
 * Split a string on a separator, trim each substring, and optionally
 * discard substrings that are empty after trimming
 *
 * Trimming and removal of empty substrings are delegated to `Arr::trim()`.
 *
 * @param non-empty-string $separator
 * @param int|null $limit Limit the number of substrings returned. Implies
 * `$removeEmpty = false`.
 * @param string|null $characters Specify characters to trim instead of
 * whitespace. If an empty string is given, substrings are not trimmed.
 * @return list<string>
 */
public static function split(
    string $separator,
    string $string,
    ?int $limit = null,
    bool $removeEmpty = true,
    ?string $characters = null
): array {
    if ($limit === null) {
        $parts = explode($separator, $string);
    } else {
        // Empty substrings count towards the limit, so they are kept
        $removeEmpty = false;
        $parts = explode($separator, $string, $limit);
    }

    $trimmed = Arr::trim($parts, $characters, $removeEmpty);
    if ($removeEmpty) {
        return $trimmed;
    }

    return array_values($trimmed);
}
494:
/**
 * Without splitting bracket-delimited or quoted substrings, split a string
 * by a single-character separator and remove whitespace from the beginning
 * and end of each substring before optionally removing empty strings
 *
 * Substrings enclosed in `()`, `<>`, `[]` or `{}` are never split. Quoted
 * substrings are preserved according to `$flags`, and backslash-escaped
 * characters within them are honoured.
 *
 * @param non-empty-string $separator A single byte that is not a bracket or
 * a preserved quote character.
 * @param string|null $characters Specify characters to trim instead of
 * whitespace. If an empty string is given, substrings are not trimmed.
 * @param int-mask-of<Str::PRESERVE_*> $flags
 * @return ($removeEmpty is true ? list<string> : non-empty-list<string>)
 * @throws InvalidArgumentException if `$separator` is not exactly one byte
 * or is one of the delimiters.
 */
public static function splitDelimited(
    string $separator,
    string $string,
    bool $removeEmpty = false,
    ?string $characters = null,
    int $flags = Str::PRESERVE_DOUBLE_QUOTED
): array {
    if (strlen($separator) !== 1) {
        throw new InvalidArgumentException('Separator must be a single character');
    }

    $quotes = '';
    $regex = '';
    if ($flags & self::PRESERVE_DOUBLE_QUOTED) {
        $quotes .= '"';
        $regex .= "|\n" . ' " (?: [^"\\\\] | \\\\ . )*+ " ';
    }
    if ($flags & self::PRESERVE_SINGLE_QUOTED) {
        $quotes .= "'";
        $regex .= "|\n" . " ' (?: [^'\\\\] | \\\\ . )*+ ' ";
    }

    if (strpos('()<>[]{}' . $quotes, $separator) !== false) {
        throw new InvalidArgumentException('Separator cannot be a delimiter');
    }

    // The pattern below is compiled in extended mode, where literal
    // whitespace outside a character class is ignored and preg_quote() does
    // not escape it, so a whitespace separator must be given as a hex
    // escape or the lookarounds that detect empty substrings would match
    // everywhere
    $quoted = strpos(" \t\n\v\f\r", $separator) !== false
        ? sprintf('\x%02X', ord($separator))
        : preg_quote($separator, '/');
    $escaped = Regex::quoteCharacterClass($separator, '/');

    $regex = <<<REGEX
    (?x)
    (?: [^{$quotes}()<>[\]{}{$escaped}]++ |
    ( \( (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \) |
    < (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ > |
    \[ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \] |
    \{ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \} {$regex}) |
    # Match empty substrings
    (?<= $quoted | ^ ) (?= $quoted | \$ ) )+
    REGEX;

    Regex::matchAll(
        Regex::delimit($regex, '/'),
        $string,
        $matches,
    );

    $split = Arr::trim(
        $matches[0],
        $characters,
        $removeEmpty
    );

    return $removeEmpty ? $split : array_values($split);
}
560:
/**
 * Wrap a string to a given number of characters, optionally varying the
 * widths of the second and subsequent lines from the first
 *
 * If `$width` is an `array`, the first line of text is wrapped to the first
 * value, and text in subsequent lines is wrapped to the second value.
 *
 * Wrapping is performed by `wordwrap()`, so widths are measured in bytes.
 *
 * @param array{int,int}|int $width
 */
public static function wrap(
    string $string,
    $width = 75,
    string $break = "\n",
    bool $cutLongWords = false
): string {
    [$delta, $width] = is_array($width)
        ? [$width[1] - $width[0], $width[1]]
        : [0, $width];

    if (!$delta) {
        return wordwrap($string, $width, $break, $cutLongWords);
    }

    // For hanging indents, remove and restore the first -$delta characters
    if ($delta < 0) {
        $offset = -$delta;
        // If nothing remains after the first $offset bytes, there is
        // nothing to wrap. Before PHP 8.0, substr() returns false for an
        // offset beyond the end of the string, which wordwrap() rejects
        // with a TypeError when strict types are enabled.
        if (strlen($string) <= $offset) {
            return $string;
        }
        return substr($string, 0, $offset)
            . wordwrap(substr($string, $offset), $width, $break, $cutLongWords);
    }

    // For first line indents, add and remove $delta characters
    return substr(
        wordwrap(str_repeat('x', $delta) . $string, $width, $break, $cutLongWords),
        $delta
    );
}
596:
/**
 * Undo wordwrap(), preserving Markdown-style paragraphs and lists
 *
 * Non-consecutive line breaks are converted to spaces unless they precede
 * one of the following:
 *
 * - four or more spaces
 * - one or more tabs
 * - a Markdown-style list item (e.g. `- item`, `1. item`)
 *
 * If `$ignoreEscapes` is `false`, whitespace escaped with a backslash is
 * preserved.
 *
 * If `$trimTrailingWhitespace` is `true`, whitespace is removed from the
 * end of each line, and if `$collapseBlankLines` is `true`, three or more
 * subsequent line breaks are collapsed to two.
 *
 * @param string $break The line break sequence used in `$string`.
 */
public static function unwrap(
    string $string,
    string $break = "\n",
    bool $ignoreEscapes = true,
    bool $trimTrailingWhitespace = false,
    bool $collapseBlankLines = false
): string {
    $newline = preg_quote($break, '/');
    // When escapes are honoured, only match after an even number of
    // backslashes (including none), and keep them out of the match
    $escapes = $ignoreEscapes ? '' : '(?<!\\\\)(?:\\\\\\\\)*\K';

    $search = [];
    $replace = [];

    if ($trimTrailingWhitespace) {
        $search[] = "/{$escapes}\h+{$newline}/";
        $replace[] = $break;
    }

    // Replace a lone line break with a space unless the next line starts
    // with four spaces, a tab, or a list item marker
    $search[] = "/{$escapes}(?<!{$newline}){$newline}(?!{$newline}| {4}|\\t|(?:[-+*]|[0-9]+[).])\h)/";
    $replace[] = ' ';

    if ($collapseBlankLines) {
        $search[] = "/(?:{$newline}){3,}/";
        $replace[] = $break . $break;
    }

    return Regex::replace($search, $replace, $string);
}
639:
/**
 * Surround a string with delimiters
 *
 * @param string|null $after If `null`, `$before` is used before and after
 * the string.
 */
public static function enclose(string $string, string $before, ?string $after = null): string
{
    $suffix = $after === null ? $before : $after;

    return "{$before}{$string}{$suffix}";
}
650:
/**
 * Get the Levenshtein distance between two strings relative to the length
 * of the longest string
 *
 * Strings that are both empty, or that are both empty after normalisation,
 * are regarded as identical.
 *
 * @param bool $normalise If true, normalise `$string1` and `$string2` with
 * {@see Str::normalise()} before comparing them.
 * @return float A value between `0` and `1`, where `0` means the strings
 * are identical, and `1` means they have no similarities.
 */
public static function distance(
    string $string1,
    string $string2,
    bool $normalise = true
): float {
    if ($string1 === '' && $string2 === '') {
        return 0.0;
    }

    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    // Normalisation can reduce both strings to nothing (e.g. "." and "!"),
    // so the divisor is checked after it is applied
    $longest = max(strlen($string1), strlen($string2));
    if ($longest === 0) {
        return 0.0;
    }

    return levenshtein($string1, $string2) / $longest;
}
678:
/**
 * Get the similarity of two strings relative to the length of the longest
 * string
 *
 * `similar_text()` is applied in both directions and the larger result is
 * used. Strings that are both empty, or that are both empty after
 * normalisation, are regarded as identical.
 *
 * @param bool $normalise If true, normalise `$string1` and `$string2` with
 * {@see Str::normalise()} before comparing them.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no similarities, and `1` means they are identical.
 */
public static function similarity(
    string $string1,
    string $string2,
    bool $normalise = true
): float {
    if ($string1 === '' && $string2 === '') {
        return 1.0;
    }

    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    // Normalisation can reduce both strings to nothing (e.g. "." and "!"),
    // so the divisor is checked after it is applied
    $longest = max(strlen($string1), strlen($string2));
    if ($longest === 0) {
        return 1.0;
    }

    $similar = max(
        similar_text($string1, $string2),
        similar_text($string2, $string1),
    );

    return $similar / $longest;
}
711:
/**
 * Get the proportion of ngrams two strings have in common, relative to the
 * number of ngrams in the longest string
 *
 * @param bool $normalise If true, normalise `$string1` and `$string2` with
 * {@see Str::normalise()} before comparing them.
 * @param int $size The number of bytes in each ngram.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramSimilarity(
    string $string1,
    string $string2,
    bool $normalise = true,
    int $size = 2
): float {
    $relativeToLongest = true;

    return self::ngramScore($relativeToLongest, $string1, $string2, $normalise, $size);
}
729:
/**
 * Get the proportion of ngrams two strings have in common, relative to the
 * number of ngrams in the shortest string
 *
 * @param bool $normalise If true, normalise `$string1` and `$string2` with
 * {@see Str::normalise()} before comparing them.
 * @param int $size The number of bytes in each ngram.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramIntersection(
    string $string1,
    string $string2,
    bool $normalise = true,
    int $size = 2
): float {
    $relativeToLongest = false;

    return self::ngramScore($relativeToLongest, $string1, $string2, $normalise, $size);
}
747:
/**
 * Get the number of ngrams shared between two strings as a proportion of
 * the number of ngrams in the longest or shortest string
 *
 * Strings that are both shorter than `$size`, or that both have no ngrams
 * after normalisation, score `1.0`. If only one string has no ngrams and
 * the score is relative to the shortest string, the result is `0.0`.
 *
 * @param bool $relativeToLongest If `true`, divide by the larger ngram
 * count, otherwise divide by the smaller.
 */
private static function ngramScore(
    bool $relativeToLongest,
    string $string1,
    string $string2,
    bool $normalise,
    int $size
): float {
    if (strlen($string1) < $size && strlen($string2) < $size) {
        return 1.0;
    }

    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    $ngrams1 = self::ngrams($string1, $size);
    $ngrams2 = self::ngrams($string2, $size);
    $count1 = count($ngrams1);
    $count2 = count($ngrams2);
    $count = $relativeToLongest
        ? max($count1, $count2)
        : min($count1, $count2);

    // Avoid division by zero when one or both strings are too short to
    // have any ngrams, e.g. after normalisation
    if ($count === 0) {
        return $count1 === $count2 ? 1.0 : 0.0;
    }

    // Each ngram in the second string can be matched once per occurrence,
    // so track how many of each are still available
    $available = array_count_values($ngrams2);
    $same = 0;
    foreach ($ngrams1 as $ngram) {
        if (($available[$ngram] ?? 0) > 0) {
            $available[$ngram]--;
            $same++;
        }
    }

    return $same / $count;
}
782:
/**
 * Get every substring of a given size from a string
 *
 * Ngrams are returned in groups: those that start at offsets `0`, `$size`,
 * `2 * $size`, ... are followed by those that start at offsets `1`,
 * `$size + 1`, ..., and so on. An empty array is returned if the string is
 * shorter than `$size`.
 *
 * @param int $size The number of bytes in each ngram.
 * @return string[]
 */
public static function ngrams(string $string, int $size = 2): array
{
    $length = strlen($string);
    if ($length < $size) {
        return [];
    }

    $ngrams = [];
    for ($offset = 0; $offset < $size; $offset++) {
        // Only use as many bytes as will divide evenly into ngrams
        $remaining = $length - $offset;
        $usable = $remaining - ($remaining % $size);
        if ($usable < 1) {
            continue;
        }
        foreach (str_split(substr($string, $offset, $usable), $size) as $ngram) {
            $ngrams[] = $ngram;
        }
    }

    return $ngrams;
}
811:
/**
 * Remove duplicates in a string where top-level lines ("sections") are
 * grouped with "list items" below
 *
 * Lines that match `$regex` are regarded as list items, and other lines are
 * used as the section name for subsequent list items. If `$loose` is
 * `false` (the default), blank lines between list items clear the current
 * section name.
 *
 * Top-level lines with no children, including any list items orphaned by
 * blank lines above them, are returned before sections with children.
 *
 * If a named subpattern in `$regex` called `indent` matches a non-empty
 * string, subsequent lines with the same number of spaces for indentation
 * as there are characters in the match are treated as part of the item,
 * including any blank lines.
 *
 * Line endings used in `$text` may be any combination of LF, CRLF and CR,
 * but LF (`"\n"`) line endings are used in the return value.
 *
 * List items are compared as strings, so items that only look numerically
 * equal (e.g. `"1"` and `"1.0"`) are not treated as duplicates.
 *
 * @param string $separator Used between top-level lines and sections. Has
 * no effect on the end-of-line sequence used between items, which is always
 * LF (`"\n"`).
 * @param string|null $marker Added before each section name. Nested list
 * items are indented by the equivalent number of spaces. To add a leading
 * `"- "` to top-level lines and indent others with two spaces, set
 * `$marker` to `"-"`.
 * @param bool $clean If `true`, the first match of `$regex` in each section
 * name is removed.
 * @param bool $loose If `true`, blank lines between list items are ignored.
 */
public static function mergeLists(
    string $text,
    string $separator = "\n",
    ?string $marker = null,
    string $regex = '/^(?<indent>\h*[-*] )/',
    bool $clean = false,
    bool $loose = false
): string {
    $marker = (string) $marker !== '' ? $marker . ' ' : null;
    $indent = $marker !== null ? str_repeat(' ', mb_strlen($marker)) : '';
    $markerIsItem = $marker !== null && Regex::match($regex, $marker);

    /** @var array<string,string[]> */
    $sections = [];
    $section = null;
    $lastWasItem = false;
    $lines = Regex::split('/\r\n|\n|\r/', $text);
    $lineCount = count($lines);
    for ($i = 0; $i < $lineCount; $i++) {
        $line = $lines[$i];

        // Remove pre-existing markers early to ensure sections with the
        // same name are combined
        if ($marker !== null && !$markerIsItem && strpos($line, $marker) === 0) {
            $line = substr($line, strlen($marker));
        }

        // Treat blank lines between items as section breaks
        if (trim($line) === '') {
            if (!$loose && $lastWasItem) {
                $section = null;
            }
            continue;
        }

        if (Regex::match($regex, $line, $matches)) {
            // Collect any subsequent indented lines
            $matchIndent = $matches['indent'] ?? '';
            if ($matchIndent !== '') {
                $matchIndent = str_repeat(' ', mb_strlen($matchIndent));
                $pendingWhitespace = '';
                $backtrack = 0;
                while ($i < $lineCount - 1) {
                    $nextLine = $lines[$i + 1];
                    if (trim($nextLine) === '') {
                        // Blank lines only belong to the item if an
                        // indented line follows them
                        $pendingWhitespace .= $nextLine . "\n";
                        $backtrack++;
                    } elseif (substr($nextLine, 0, strlen($matchIndent)) === $matchIndent) {
                        $line .= "\n" . $pendingWhitespace . $nextLine;
                        $pendingWhitespace = '';
                        $backtrack = 0;
                    } else {
                        // Return unclaimed blank lines to the main loop
                        $i -= $backtrack;
                        break;
                    }
                    $i++;
                }
            }
        } else {
            $section = $line;
        }

        $key = $section ?? $line;

        if (!array_key_exists($key, $sections)) {
            $sections[$key] = [];
        }

        if ($key !== $line) {
            // Strict comparison prevents numeric-looking lines such as "1"
            // and "1.0" from being treated as duplicates
            if (!in_array($line, $sections[$key], true)) {
                $sections[$key][] = $line;
            }
            $lastWasItem = true;
        } else {
            $lastWasItem = false;
        }
    }

    // Move lines with no associated list to the top
    /** @var array<string,string[]> */
    $top = [];
    $last = null;
    foreach ($sections as $name => $items) {
        // PHP converts array keys that are decimal integers to int, so
        // restore the original line before passing it to string functions
        $name = (string) $name;

        if (count($items)) {
            continue;
        }

        unset($sections[$name]);

        if ($clean) {
            $top[$name] = [];
            continue;
        }

        // Collect second and subsequent consecutive top-level list items
        // under the first so they don't form a loose list
        if (Regex::match($regex, $name)) {
            if ($last !== null) {
                $top[$last][] = $name;
                continue;
            }
            $last = $name;
        } else {
            $last = null;
        }
        $top[$name] = [];
    }
    // Keys in $top were removed from $sections above, so the union has the
    // same order as array_merge() without renumbering integer keys
    /** @var array<string,string[]> */
    $sections = $top + $sections;

    $groups = [];
    foreach ($sections as $name => $items) {
        $name = (string) $name;

        if ($clean) {
            $name = Regex::replace($regex, '', $name, 1);
        }

        $marked = false;
        if ($marker !== null
            && !($markerIsItem && strpos($name, $marker) === 0)
            && !Regex::match($regex, $name)) {
            $name = $marker . $name;
            $marked = true;
        }

        if (!$items) {
            $groups[] = $name;
            continue;
        }

        // Don't separate or indent top-level list items collected above
        if (!$marked && Regex::match($regex, $name)) {
            $groups[] = implode("\n", [$name, ...$items]);
            continue;
        }

        $groups[] = $name;
        $groups[] = $indent . implode("\n" . $indent, $items);
    }

    return implode($separator, $groups);
}
982: }
983: