1: <?php declare(strict_types=1);
2:
3: namespace Salient\Utility;
4:
5: use Salient\Utility\Internal\ListMerger;
6: use Closure;
7: use InvalidArgumentException;
8: use Stringable;
9:
10: /**
11: * Work with strings
12: *
13: * @api
14: */
15: final class Str extends AbstractUtility
16: {
/** ASCII letters followed by decimal digits */
public const ALPHANUMERIC = Str::ALPHA . Str::NUMERIC;

/** Lowercase ASCII letters followed by uppercase ASCII letters */
public const ALPHA = Str::LOWER . Str::UPPER;

/** Lowercase ASCII letters, in the same order as Str::UPPER */
public const LOWER = 'abcdefghijklmnopqrstuvwxyz';

/** Uppercase ASCII letters, in the same order as Str::LOWER */
public const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';

/** Decimal digits */
public const NUMERIC = '0123456789';

/** Hexadecimal digits, including both lowercase and uppercase letters */
public const HEX = '0123456789abcdefABCDEF';

/** splitDelimited() flag: do not split double-quoted substrings */
public const PRESERVE_DOUBLE_QUOTED = 1;

/** splitDelimited() flag: do not split single-quoted substrings */
public const PRESERVE_SINGLE_QUOTED = 2;

/** splitDelimited() flag: do not split double- or single-quoted substrings */
public const PRESERVE_QUOTED = Str::PRESERVE_DOUBLE_QUOTED | Str::PRESERVE_SINGLE_QUOTED;

/**
 * Every byte from 0x80 to 0xFF, i.e. every byte outside the 7-bit ASCII
 * range
 */
public const ASCII_EXTENDED =
    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
    . "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
    . "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
    . "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
    . "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
    . "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
    . "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
    . "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";

/**
 * Default value of mergeLists() parameter $itemRegex
 *
 * Matches optional horizontal whitespace followed by "- " or "* " at the
 * start of a line, capturing it as the named subpattern "indent".
 */
public const DEFAULT_ITEM_REGEX = '/^(?<indent>\h*[-*] )/';
41:
/**
 * Get the first value that is neither null nor an empty string after being
 * cast to string, or the last value given if there is no such value
 *
 * Returns `null` if no values are given or if the last value is `null`.
 *
 * @param int|float|string|bool|Stringable|null ...$strings
 */
public static function coalesce(...$strings): ?string
{
    $last = null;
    foreach ($strings as $value) {
        if ($value === null) {
            $last = null;
            continue;
        }
        $last = (string) $value;
        if ($last !== '') {
            return $last;
        }
    }

    // Nothing usable was found, so fall back to the final value (if any)
    return $last;
}
60:
/**
 * Convert ASCII letters in a string to lowercase
 *
 * Bytes other than "A" to "Z" are left untouched, so the result does not
 * depend on the current locale.
 */
public static function lower(string $string): string
{
    $from = self::UPPER;
    $to = self::LOWER;

    return strtr($string, $from, $to);
}
68:
/**
 * Convert ASCII letters in a string to uppercase
 *
 * Bytes other than "a" to "z" are left untouched, so the result does not
 * depend on the current locale.
 */
public static function upper(string $string): string
{
    $from = self::LOWER;
    $to = self::UPPER;

    return strtr($string, $from, $to);
}
76:
/**
 * Make the first character in a string uppercase if it is an ASCII letter
 *
 * An empty string is returned unchanged.
 */
public static function upperFirst(string $string): string
{
    if ($string === '') {
        return $string;
    }

    return self::upper($string[0]) . substr($string, 1);
}
87:
/**
 * Match a string's case to another string
 *
 * `$match` is trimmed, then only its ASCII letters are considered:
 *
 * - if it is empty or has no letters, `$string` is returned as-is
 * - if it is a single lowercase letter, `$string` is converted to lowercase
 * - if it is a single uppercase letter, `$string` is converted to lowercase
 *   and its first character is made uppercase
 * - if its letters are all uppercase or all lowercase, `$string` is
 *   converted to the same case
 * - if it has both cases and starts with an uppercase letter, `$string` is
 *   converted to lowercase and its first character is made uppercase
 * - otherwise, `$string` is returned as-is
 */
public static function matchCase(string $string, string $match): string
{
    $match = trim($match);
    if ($match === '') {
        return $string;
    }

    // The part of $match from its first uppercase letter, or false if none
    $fromFirstUpper = strpbrk($match, self::UPPER);
    $anyUpper = $fromFirstUpper !== false;
    $anyLower = strpbrk($match, self::LOWER) !== false;

    if (strlen($match) === 1) {
        if ($anyLower) {
            return self::lower($string);
        }
        if ($anyUpper) {
            return self::upperFirst(self::lower($string));
        }
        return $string;
    }

    // Exactly one case is present
    if ($anyUpper !== $anyLower) {
        return $anyUpper
            ? self::upper($string)
            : self::lower($string);
    }

    // Either no letters are present, or both cases are: only act in the
    // latter case, and only if $match starts with an uppercase letter
    if (!$anyUpper || $fromFirstUpper !== $match) {
        return $string;
    }

    return self::upperFirst(self::lower($string));
}
127:
/**
 * Check if a string starts with any of the given substrings
 *
 * Empty needles never match. If `$ignoreCase` is `true`, the haystack is
 * passed to Str::lower() and the needles to Arr::lower() before comparison.
 *
 * @param iterable<string>|string $needles
 */
public static function startsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $candidates = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $candidates = Arr::lower($candidates);
    }

    foreach ($candidates as $candidate) {
        if ($candidate === '') {
            continue;
        }
        if (strncmp($haystack, $candidate, strlen($candidate)) === 0) {
            return true;
        }
    }

    return false;
}
149:
/**
 * Check if a string ends with any of the given substrings
 *
 * Empty needles never match. If `$ignoreCase` is `true`, the haystack is
 * passed to Str::lower() and the needles to Arr::lower() before comparison.
 *
 * @param iterable<string>|string $needles
 */
public static function endsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $candidates = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $candidates = Arr::lower($candidates);
    }

    foreach ($candidates as $candidate) {
        if ($candidate === '') {
            continue;
        }
        $tail = substr($haystack, -strlen($candidate));
        if ($tail === $candidate) {
            return true;
        }
    }

    return false;
}
171:
/**
 * Check if every character in a string has a codepoint between 0 and 127
 *
 * An empty string is considered ASCII.
 */
public static function isAscii(string $string): bool
{
    // strpbrk() returns false when none of the listed bytes are present
    return strpbrk($string, self::ASCII_EXTENDED) === false;
}
179:
/**
 * Escape special characters in a string for use in Markdown
 *
 * Text matched by the pattern below is replaced with a backslash followed
 * by the matched text (assuming Regex::replace() follows preg_replace()
 * replacement semantics).
 */
public static function escapeMarkdown(string $string): string
{
    // Compiled with the "x" modifier, so whitespace between tokens is
    // insignificant, and "m", so "^" and "$" apply to every line
    $pattern = <<<'REGEX'
    / [*<[\\`|] |
    (?<= [\h[:punct:]] (?: (?<! _ ) | (?<= \G ) ) | ^ ) _ |
    _ (?= _*+ (?: [\h[:punct:]] | $ | \R ) ) |
    (?<! ~ ) ~ (?= ~ (?! ~ ) ) |
    ^ \h* \K (?: > | ~ (?= ~~+ ) | (?: \# {1,6} | [+-] | [0-9]+ \K \. ) (?= \h ) ) /mx
    REGEX;

    // An escaped backslash, then the whole match
    $replacement = '\\\\$0';

    return Regex::replace($pattern, $replacement, $string);
}
197:
/**
 * Normalise a string for comparison
 *
 * The following operations are applied, in order:
 *
 * 1. Replace "&" with " and " when it appears between alphanumeric
 *    characters
 * 2. Remove "."
 * 3. Replace non-alphanumeric character sequences with " "
 * 4. Remove leading and trailing whitespace
 * 5. Convert ASCII letters to uppercase
 *
 * The return value of this method is not covered by the Salient toolkit's
 * backward compatibility promise.
 */
public static function normalise(string $string): string
{
    $patterns = [
        '/([[:alnum:]][^&]*+)&(?=[^&[:alnum:]]*+[[:alnum:]])/u',
        '/\.++/',
        '/[^[:alnum:]]++/u',
    ];
    $replacements = [
        '$1 and ',
        '',
        ' ',
    ];

    $normalised = Regex::replace($patterns, $replacements, $string);

    return self::upper(trim($normalised));
}
221:
/**
 * Replace the end of a string with an ellipsis ("...") if its length
 * exceeds a limit
 *
 * Length is measured in characters, not bytes. Whitespace immediately
 * before the ellipsis is removed.
 *
 * @param int<3,max> $length
 */
public static function ellipsize(string $value, int $length): string
{
    if (mb_strlen($value) <= $length) {
        return $value;
    }

    // Reserve three characters for the ellipsis
    $kept = rtrim(mb_substr($value, 0, $length - 3));

    return $kept . '...';
}
236:
/**
 * Apply an end-of-line sequence to a string
 *
 * Every CRLF, CR and LF in `$string` is replaced with `$eol`.
 */
public static function setEol(string $string, string $eol = "\n"): string
{
    // Reduce every line ending to LF first, so CRLF is never counted twice
    $normalised = str_replace(["\r\n", "\r"], "\n", $string);

    if ($eol === "\n") {
        return $normalised;
    }

    return str_replace("\n", $eol, $normalised);
}
253:
/**
 * Remove native end-of-line sequences from the end of a string
 *
 * When the native sequence is LF, the LF in a trailing CRLF is preserved.
 */
public static function trimNativeEol(string $string): string
{
    if (\PHP_EOL !== "\n") {
        $eolLength = strlen(\PHP_EOL);
        while (substr($string, -$eolLength) === \PHP_EOL) {
            $string = substr($string, 0, -$eolLength);
        }
        return $string;
    }

    $trimmed = rtrim($string, "\n");
    $removed = $trimmed !== $string;

    // If a CR is exposed by trimming, restore the LF that belonged to it
    if ($removed && $trimmed !== '' && substr($trimmed, -1) === "\r") {
        return $trimmed . "\n";
    }

    return $trimmed;
}
275:
/**
 * Replace line feed (LF) characters in a string with the native end-of-line
 * sequence
 *
 * The string is returned unchanged if the native sequence is LF.
 */
public static function eolToNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace("\n", \PHP_EOL, $string);
}
286:
/**
 * Replace native end-of-line sequences in a string with the line feed (LF)
 * character
 *
 * The string is returned unchanged if the native sequence is LF.
 */
public static function eolFromNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace(\PHP_EOL, "\n", $string);
}
297:
/**
 * Convert words in a string to snake_case, optionally preserving non-word
 * characters
 *
 * Words are found and delimited by Str::words() with "_" as the separator,
 * then ASCII letters are converted to lowercase.
 */
public static function snake(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '_', $preserve);

    return self::lower($delimited);
}
306:
/**
 * Convert words in a string to kebab-case, optionally preserving non-word
 * characters
 *
 * Words are found and delimited by Str::words() with "-" as the separator,
 * then ASCII letters are converted to lowercase.
 */
public static function kebab(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '-', $preserve);

    return self::lower($delimited);
}
315:
/**
 * Convert words in a string to camelCase, optionally preserving non-word
 * characters
 *
 * The string is converted to PascalCase first, then every letter that is
 * not preceded by an alphanumeric character is passed to Str::lower().
 */
public static function camel(string $string, string $preserve = ''): string
{
    $pascal = self::pascal($string, $preserve);

    return Regex::replaceCallback(
        '/(?<![[:alnum:]])[[:alpha:]]/u',
        static function ($matches) {
            return self::lower($matches[0]);
        },
        $pascal,
    );
}
328:
/**
 * Convert words in a string to PascalCase, optionally preserving non-word
 * characters
 *
 * Words are found by Str::words() with no separator, and each word is
 * converted to lowercase before its first character is made uppercase.
 */
public static function pascal(string $string, string $preserve = ''): string
{
    $capitalise = static fn($word) => self::upperFirst(self::lower($word));

    return self::words($string, '', $preserve, $capitalise);
}
337:
/**
 * Get words from a string and delimit them with a separator, optionally
 * preserving non-word characters and applying a callback to each word
 *
 * A word consists of one or more letters of the same case, or one uppercase
 * letter followed by zero or more lowercase letters. Numbers are treated as
 * lowercase letters except that two or more uppercase letters form one word
 * with any subsequent numbers.
 *
 * @param string $separator Inserted between words. May be empty.
 * @param string $preserve Non-alphanumeric characters to keep in the
 * string. Alphanumeric characters in `$preserve` are ignored.
 * @param (Closure(string): string)|null $callback Applied to each word, if
 * given.
 * @throws InvalidArgumentException if `$separator` contains an alphanumeric
 * or preserved character.
 */
public static function words(
    string $string,
    string $separator = ' ',
    string $preserve = '',
    ?Closure $callback = null
): string {
    // Assertion applied before each word when inserting separators; only
    // needed when characters are preserved
    $notAfterPreserve = '';
    if (
        $preserve !== ''
        && ($preserve = Regex::replace('/[[:alnum:]]++/u', '', $preserve)) !== ''
    ) {
        // From here on, $preserve is the content of a character class that
        // matches every character to keep
        $preserve = Regex::quoteCharacters($preserve, '/');
        $preserve = "[:alnum:]{$preserve}";
        // Prevent "key=value" becoming "key= value" when preserving "=" by
        // asserting that when separating words, they must appear:
        // - immediately after the previous word (\G),
        // - after an unpreserved character, or
        // - at a word boundary (e.g. "Value" in "key=someValue")
        if ($separator !== '') {
            $notAfterPreserve = '(?:\G'
                . "|(?<=[^{$preserve}])"
                . '|(?<=[[:lower:][:digit:]])(?=[[:upper:]]))';
        }
    } else {
        $preserve = '[:alnum:]';
    }
    // First alternative: an optional uppercase letter followed by lowercase
    // letters and digits. Second alternative: uppercase letters that are
    // not followed by a lowercase letter, then any digits.
    $word = '(?:[[:upper:]]?[[:lower:][:digit:]]++'
        . '|(?:[[:upper:]](?![[:lower:]]))++[[:digit:]]*+)';

    // Insert separators before words to prevent "foo bar" becoming "foobar"
    if ($separator !== '') {
        if (Regex::match("/[{$preserve}]/u", $separator)) {
            throw new InvalidArgumentException('Invalid separator (preserved characters cannot be used)');
        }
        // $separator is only used as a regex replacement below
        $separator = Regex::quoteReplacement($separator);
        $string = Regex::replace(
            "/$notAfterPreserve$word/u",
            $separator . '$0',
            $string,
        );
    }

    if ($callback !== null) {
        $string = Regex::replaceCallback(
            "/$word/u",
            fn($matches) => $callback($matches[0]),
            $string,
        );
    }

    // Trim unpreserved characters from the beginning and end of the string,
    // then replace sequences of them with one separator
    return Regex::replace([
        "/^[^{$preserve}]++|[^{$preserve}]++\$/uD",
        "/[^{$preserve}]++/u",
    ], [
        '',
        $separator,
    ], $string);
}
409:
/**
 * Expand tabs in a string to spaces
 *
 * Each tab is replaced with enough spaces to reach the next tab stop. The
 * column is reset to 1 after every line break (CRLF, LF or CR), and line
 * breaks are preserved as-is.
 *
 * @param int<1,max> $tabSize
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandTabs(
    string $string,
    int $tabSize = 8,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }

    // Alternating [line, eol, line, eol, ..., line], padded with an empty
    // final eol so it can be consumed in pairs
    $chunks = Regex::split('/(\r\n|\n|\r)/', $string, -1, \PREG_SPLIT_DELIM_CAPTURE);
    $chunks[] = '';

    $result = '';
    foreach (array_chunk($chunks, 2) as [$text, $eol]) {
        $segments = explode("\t", $text);
        // Text after the last tab is copied without expansion
        $final = array_pop($segments);
        foreach ($segments as $segment) {
            $column += mb_strlen($segment);
            // e.g. with $tabSize 4, a tab at $column 2 occupies 3 spaces
            $fill = $tabSize - (($column - 1) % $tabSize);
            $result .= $segment . str_repeat(' ', $fill);
            $column += $fill;
        }
        $result .= $final . $eol;
        $column = 1;
    }

    return $result;
}
446:
/**
 * Expand leading tabs in a string to spaces
 *
 * On each line, tabs are expanded until text other than spaces and tabs is
 * encountered. The rest of the line, including any tabs, is copied as-is.
 *
 * @param int<1,max> $tabSize
 * @param bool $preserveLine1 If `true`, tabs in the first line of `$string`
 * are not expanded.
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandLeadingTabs(
    string $string,
    int $tabSize = 8,
    bool $preserveLine1 = false,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }

    // Alternating [line, eol, line, eol, ..., line], padded with an empty
    // final eol so it can be consumed in pairs
    $chunks = Regex::split('/(\r\n|\n|\r)/', $string, -1, \PREG_SPLIT_DELIM_CAPTURE);
    $chunks[] = '';

    $result = '';
    foreach (array_chunk($chunks, 2) as $index => [$text, $eol]) {
        if ($index === 0 && $preserveLine1) {
            $result .= $text . $eol;
            $column = 1;
            continue;
        }

        $segments = explode("\t", $text);
        while (true) {
            $segment = array_shift($segments);
            $result .= $segment;
            if ($segments === []) {
                // No tab follows this segment
                break;
            }
            if (trim($segment, ' ') !== '') {
                // Not in leading whitespace anymore, so restore the
                // remaining tabs and stop expanding
                $result .= "\t" . implode("\t", $segments);
                break;
            }
            $column += mb_strlen($segment);
            $fill = $tabSize - (($column - 1) % $tabSize);
            $result .= str_repeat(' ', $fill);
            $column += $fill;
        }

        $result .= $eol;
        $column = 1;
    }

    return $result;
}
494:
/**
 * Copy a string to a php://temp stream
 *
 * The stream is opened in "r+" mode via File::open(), the string is written
 * to it with File::writeAll(), and File::rewind() is called on it before it
 * is returned.
 *
 * @return resource
 */
public static function toStream(string $string)
{
    $handle = File::open('php://temp', 'r+');

    File::writeAll($handle, $string);
    File::rewind($handle);

    return $handle;
}
507:
/**
 * Split a string by a string, trim substrings and remove any empty strings
 *
 * Substrings are trimmed, and empty strings optionally removed, by
 * Arr::trim().
 *
 * @param non-empty-string $separator
 * @param int|null $limit The maximum number of substrings to return.
 * Implies `$removeEmpty = false` if not `null`.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @return ($limit is null ? ($removeEmpty is true ? list<string> : non-empty-list<string>) : non-empty-list<string>)
 */
public static function split(
    string $separator,
    string $string,
    ?int $limit = null,
    bool $removeEmpty = true,
    ?string $characters = null
): array {
    // Empty strings are always kept when a limit is given
    $keepEmpty = $limit !== null || !$removeEmpty;

    $parts = explode($separator, $string, $limit ?? \PHP_INT_MAX);
    $parts = Arr::trim($parts, $characters, !$keepEmpty);

    return $keepEmpty
        ? array_values($parts)
        : $parts;
}
532:
/**
 * Split a string by a string without splitting bracket-delimited or
 * double-quoted substrings, trim substrings and remove any empty strings
 *
 * Substrings enclosed in "()", "<>", "[]" or "{}" are never split. Quoted
 * substrings are preserved according to `$flags`.
 *
 * @param non-empty-string $separator Must be exactly one byte long and
 * cannot be a bracket or a preserved quote character.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @param int-mask-of<Str::PRESERVE_*> $flags
 * @return ($removeEmpty is true ? list<string> : non-empty-list<string>)
 * @throws InvalidArgumentException if `$separator` is not a single
 * character or is one of the delimiters.
 */
public static function splitDelimited(
    string $separator,
    string $string,
    bool $removeEmpty = true,
    ?string $characters = null,
    int $flags = Str::PRESERVE_DOUBLE_QUOTED
): array {
    if (strlen($separator) !== 1) {
        throw new InvalidArgumentException('Separator must be a single character');
    }

    // $quotes collects the quote characters to preserve; $regex collects an
    // alternative for each that matches a quoted substring, allowing for
    // backslash-escaped characters
    $quotes = '';
    $regex = '';
    if ($flags & self::PRESERVE_DOUBLE_QUOTED) {
        $quotes .= '"';
        $regex .= ' | " (?: [^"\\\\] | \\\\ . )*+ "';
    }
    if ($flags & self::PRESERVE_SINGLE_QUOTED) {
        $quotes .= "'";
        $regex .= " | ' (?: [^'\\\\] | \\\\ . )*+ '";
    }

    if (strpos('()<>[]{}' . $quotes, $separator) !== false) {
        throw new InvalidArgumentException('Separator cannot be a delimiter');
    }

    // $quoted is used outside character classes, $escaped inside them
    $quoted = Regex::quote($separator, '/');
    $escaped = Regex::quoteCharacters($separator, '/');
    // Each match is one substring: a run of characters that are not
    // delimiters or the separator, bracketed groups (matched recursively
    // via "(?-1)") and quoted substrings, or an empty string between
    // separators.
    //
    // NOTE(review): the pattern uses extended mode, so a whitespace
    // separator would only work outside character classes if Regex::quote()
    // escapes whitespace -- confirm
    $regex = <<<REGEX
    (?x)
    (?: [^{$quotes}()<>[\]{}{$escaped}]++ |
    ( \( (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \) |
    < (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ > |
    \[ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \] |
    \{ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \}{$regex} ) |
    # Match empty substrings
    (?<= $quoted | ^ ) (?= $quoted | \$ ) )+
    REGEX;
    $regex = Regex::delimit($regex, '/');
    Regex::matchAll($regex, $string, $matches);
    $split = Arr::trim($matches[0], $characters, $removeEmpty);

    // @phpstan-ignore return.type
    return $removeEmpty ? $split : array_values($split);
}
588:
/**
 * Wrap a string to a given number of characters, optionally varying the
 * width of the first line
 *
 * Wrapping is performed by wordwrap().
 *
 * @param int|array{int,int} $width The number of characters at which the
 * string will be wrapped, or `[ <first_line_width>, <width> ]`.
 */
public static function wrap(
    string $string,
    $width = 75,
    string $break = "\n",
    bool $cutLongWords = false
): string {
    // $delta is how much narrower the first line is than subsequent lines
    $delta = 0;
    if (is_array($width)) {
        [$first, $rest] = $width;
        $delta = $rest - $first;
        $width = $rest;
    }

    if (!$delta) {
        return wordwrap($string, $width, $break, $cutLongWords);
    }

    if ($delta < 0) {
        // For hanging indents, remove and restore $delta characters
        $head = substr($string, 0, -$delta);
        $tail = substr($string, -$delta);
        return $head . wordwrap($tail, $width, $break, $cutLongWords);
    }

    // For first line indents, add and remove $delta characters
    $padded = str_repeat('x', $delta) . $string;
    $wrapped = wordwrap($padded, $width, $break, $cutLongWords);

    return substr($wrapped, $delta);
}
618:
/**
 * Undo wordwrap(), preserving Markdown-style paragraphs and lists
 *
 * Non-consecutive line breaks are converted to spaces except before:
 *
 * - four or more spaces
 * - one or more tabs
 * - Markdown-style list items (e.g. `- item`, `1. item`)
 *
 * Line breaks at the start or end of the string are also preserved.
 *
 * @param string $break The line break sequence used in `$string`.
 * @param bool $ignoreEscapes If `false`, preserve escaped whitespace.
 * @param bool $trimLines If `true`, remove whitespace from the end of each
 * line and between unwrapped lines.
 * @param bool $collapseBlankLines If `true`, collapse three or more
 * subsequent line breaks to two.
 */
public static function unwrap(
    string $string,
    string $break = "\n",
    bool $ignoreEscapes = true,
    bool $trimLines = false,
    bool $collapseBlankLines = false
): string {
    $newline = Regex::quote($break, '/');
    // When escapes are honoured, only match after an even number of
    // backslashes (including none)
    $noEscape = $ignoreEscapes ? '' : '(?<!\\\\)(?:\\\\\\\\)*\K';

    $search = [];
    $replace = [];
    $between = '';

    if ($trimLines) {
        // Remove horizontal whitespace before line breaks
        $search[] = "/{$noEscape}\h+({$newline})/";
        $replace[] = '$1';
        // Also consume horizontal whitespace after unwrapped line breaks
        $between = '\h*';
    }

    // Replace a line break with a space unless it is adjacent to another
    // line break or to the start or end of the string, or is followed by
    // four spaces, a tab or a list item. " {4}" is spelled out as a
    // quantifier so the "four or more spaces" rule is explicit.
    $search[] = "/{$noEscape}(?<!{$newline}|^){$newline}(?!{$newline}|\$| {4}|\\t|(?:[-+*]|[0-9]+[).])\h){$between}/D";
    $replace[] = ' ';

    if ($collapseBlankLines) {
        $search[] = "/(?:{$newline}){3,}/";
        $replace[] = $break . $break;
    }

    return Regex::replace($search, $replace, $string);
}
662:
/**
 * Replace whitespace character sequences in a string with a single space
 *
 * Leading and trailing whitespace is collapsed, not removed.
 */
public static function collapse(string $string): string
{
    $whitespace = '/\s++/';

    return Regex::replace($whitespace, ' ', $string);
}
670:
/**
 * Enclose a string between delimiters
 *
 * @param string|null $after If `null`, `$before` is used before and after
 * the string. An empty string is used as given.
 */
public static function enclose(string $string, string $before, ?string $after = null): string
{
    if ($after === null) {
        $after = $before;
    }

    return "{$before}{$string}{$after}";
}
681:
/**
 * Get the Levenshtein distance between two strings relative to the length
 * of the longest string
 *
 * Lengths are measured in bytes.
 *
 * @param bool $normalise If `true`, both strings are passed to
 * Str::normalise() before they are compared.
 * @return float A value between `0` and `1`, where `0` means the strings
 * are identical, and `1` means they have no similarities.
 */
public static function distance(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    $longest = max(strlen($string1), strlen($string2));

    // Two empty strings are identical
    if ($longest === 0) {
        return 0.0;
    }

    return levenshtein($string1, $string2) / $longest;
}
706:
/**
 * Get the similarity of two strings relative to the length of the longest
 * string
 *
 * similar_text() is called with the strings in both orders and the larger
 * result is used. Lengths are measured in bytes.
 *
 * @param bool $normalise If `true`, both strings are passed to
 * Str::normalise() before they are compared.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no similarities, and `1` means they are identical.
 */
public static function similarity(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    $longest = max(strlen($string1), strlen($string2));

    // Two empty strings are identical
    if ($longest === 0) {
        return 1.0;
    }

    $forward = similar_text($string1, $string2);
    $reverse = similar_text($string2, $string1);

    return max($forward, $reverse) / $longest;
}
733:
/**
 * Get ngrams shared between two strings relative to the number of ngrams in
 * the longest string
 *
 * @param bool $normalise If `true`, both strings are passed to
 * Str::normalise() before they are compared.
 * @param int $size The length of each ngram, in bytes.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramSimilarity(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    $relativeToLongest = true;

    return self::ngramScore($relativeToLongest, $string1, $string2, $normalise, $size);
}
749:
/**
 * Get ngrams shared between two strings relative to the number of ngrams in
 * the shortest string
 *
 * @param bool $normalise If `true`, both strings are passed to
 * Str::normalise() before they are compared.
 * @param int $size The length of each ngram, in bytes.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramIntersection(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    $relativeToLongest = false;

    return self::ngramScore($relativeToLongest, $string1, $string2, $normalise, $size);
}
765:
/**
 * Get the number of ngrams shared between two strings relative to the
 * number of ngrams in the longest or shortest string
 *
 * Shared ngrams are counted as a multiset intersection, i.e. an ngram that
 * appears twice in one string and once in the other is counted once.
 *
 * @param bool $relativeToLongest If `true`, the result is relative to the
 * larger of the two ngram counts, otherwise to the smaller.
 * @param bool $normalise If `true`, both strings are passed to
 * Str::normalise() before they are compared.
 * @param int $size The length of each ngram, in bytes.
 * @return float `1.0` if both strings are shorter than `$size`, `0.0` if
 * the result would otherwise be relative to zero ngrams, or a value between
 * `0` and `1`.
 */
private static function ngramScore(
    bool $relativeToLongest,
    string $string1,
    string $string2,
    bool $normalise,
    int $size
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    // Neither string has any ngrams, so their ngrams are identical
    if (strlen($string1) < $size && strlen($string2) < $size) {
        return 1.0;
    }

    $ngrams1 = self::ngrams($string1, $size);
    $ngrams2 = self::ngrams($string2, $size);
    $count = $relativeToLongest
        ? max(count($ngrams1), count($ngrams2))
        : min(count($ngrams1), count($ngrams2));

    // If only one string is too short to have ngrams and the score is
    // relative to the shortest string, there is nothing to share, and
    // dividing by $count would raise a DivisionByZeroError
    if ($count === 0) {
        return 0.0;
    }

    // Count occurrences of each ngram in the second string, then consume
    // one occurrence for every matching ngram in the first
    $remaining = array_count_values($ngrams2);
    $same = 0;
    foreach ($ngrams1 as $ngram) {
        if (($remaining[$ngram] ?? 0) > 0) {
            $same++;
            $remaining[$ngram]--;
        }
    }

    return $same / $count;
}
799:
/**
 * Get a string's n-grams
 *
 * The string is divided into consecutive `$size`-byte substrings starting
 * at each offset from `0` to `$size - 1`, so n-grams are grouped by
 * starting offset rather than returned in order of appearance, e.g. the
 * 2-grams of "abcd" are `["ab", "cd", "bc"]`.
 *
 * @return string[] An empty array if `$string` is shorter than `$size`.
 */
public static function ngrams(string $string, int $size = 2): array
{
    $length = strlen($string);
    if ($length < $size) {
        return [];
    }

    $ngrams = [];
    for ($offset = 0; $offset < $size; $offset++) {
        // Ignore trailing bytes that don't fill a whole n-gram
        $usable = $length - $offset;
        $usable -= $usable % $size;
        if ($usable === 0) {
            continue;
        }
        foreach (str_split(substr($string, $offset, $usable), $size) as $ngram) {
            $ngrams[] = $ngram;
        }
    }

    return $ngrams;
}
830:
/**
 * Group lists in a string by heading and remove duplicate items
 *
 * - Lines in `$string` are processed in order, from first to last
 * - If a non-empty line matches `$itemRegex`, it is treated as a list item,
 *   otherwise it becomes the current heading
 * - The current heading is cleared when an empty line is encountered after
 *   a list item (unless `$loose` is `true`)
 * - Top-level lines (headings with no items, and items with no heading) are
 *   returned before lists with headings
 * - If `$itemRegex` has a named subpattern called `indent` that matches a
 *   non-empty string, subsequent lines with indentation of the same width
 *   are treated as a continuation of the item, along with any empty lines
 *   between them
 *
 * The work is delegated to an instance of ListMerger.
 *
 * @param string $listSeparator Inserted between headings and lists.
 * @param string|null $headingPrefix Inserted before headings, e.g. `"-"`.
 * Indentation of the same width is applied to subsequent list items. An
 * empty string is equivalent to `null`.
 * @param string|null $itemRegex If `null`, Str::DEFAULT_ITEM_REGEX is used.
 * @param bool $clean If `true`, remove the first match of `$itemRegex` from
 * the beginning of each item with no heading.
 * @param bool $loose If `true`, do not clear the current heading when an
 * empty line is encountered.
 * @param bool $discardEmpty If `true`, discard headings with no items.
 * @param int<1,max> $tabSize
 */
public static function mergeLists(
    string $string,
    string $listSeparator = "\n",
    ?string $headingPrefix = null,
    ?string $itemRegex = Str::DEFAULT_ITEM_REGEX,
    bool $clean = false,
    bool $loose = false,
    bool $discardEmpty = false,
    string $eol = "\n",
    int $tabSize = 4
): string {
    // Reduce an empty heading prefix to null
    $prefix = self::coalesce($headingPrefix, null);
    $regex = $itemRegex ?? self::DEFAULT_ITEM_REGEX;

    $merger = new ListMerger(
        $listSeparator,
        $prefix,
        $regex,
        $clean,
        $loose,
        $discardEmpty,
        $eol,
        $tabSize,
    );

    return $merger->merge($string);
}
878: }
879: