1: <?php declare(strict_types=1);
2:
3: namespace Salient\Utility;
4:
5: use Salient\Utility\Internal\ListMerger;
6: use Closure;
7: use InvalidArgumentException;
8: use Stringable;
9:
10: /**
11: * Work with strings
12: *
13: * @api
14: */
15: final class Str extends AbstractUtility
16: {
/** ASCII letters (lowercase, then uppercase) followed by ASCII digits */
public const ALPHANUMERIC = self::ALPHA . self::NUMERIC;

/** ASCII letters: lowercase, then uppercase */
public const ALPHA = self::LOWER . self::UPPER;

/** ASCII lowercase letters */
public const LOWER = 'abcdefghijklmnopqrstuvwxyz';

/** ASCII uppercase letters, in the same order as LOWER */
public const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';

/** ASCII digits */
public const NUMERIC = '0123456789';

/** Hexadecimal digits, including both cases of "a" to "f" */
public const HEX = '0123456789abcdefABCDEF';

/** splitDelimited() flag: do not split double-quoted substrings */
public const PRESERVE_DOUBLE_QUOTED = 1;

/** splitDelimited() flag: do not split single-quoted substrings */
public const PRESERVE_SINGLE_QUOTED = 2;

/** splitDelimited() flag: do not split single- or double-quoted substrings */
public const PRESERVE_QUOTED = self::PRESERVE_DOUBLE_QUOTED | self::PRESERVE_SINGLE_QUOTED;

/** Every byte from 0x80 to 0xff, i.e. bytes outside the 7-bit ASCII range */
public const ASCII_EXTENDED =
    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
    . "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
    . "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
    . "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
    . "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
    . "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
    . "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
    . "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";

/**
 * Default value of mergeLists() parameter $itemRegex
 */
public const DEFAULT_ITEM_REGEX = '/^(?<indent>\h*[-*] )/';

/**
 * 5-bit value of each character in the base32 alphabet used by
 * decodeBase32()
 */
private const BASE32_INDEX = [
    'A' => 0, 'B' => 1, 'C' => 2, 'D' => 3,
    'E' => 4, 'F' => 5, 'G' => 6, 'H' => 7,
    'I' => 8, 'J' => 9, 'K' => 10, 'L' => 11,
    'M' => 12, 'N' => 13, 'O' => 14, 'P' => 15,
    'Q' => 16, 'R' => 17, 'S' => 18, 'T' => 19,
    'U' => 20, 'V' => 21, 'W' => 22, 'X' => 23,
    'Y' => 24, 'Z' => 25, '2' => 26, '3' => 27,
    '4' => 28, '5' => 29, '6' => 30, '7' => 31,
];
43:
/**
 * Get the first value that is not null or an empty string after being cast
 * to a string, or the last value given if there is no such value
 *
 * The last value is cast to a string unless it is `null`. `null` is also
 * returned if no values are given.
 *
 * @param int|float|string|bool|Stringable|null ...$strings
 */
public static function coalesce(...$strings): ?string
{
    $last = null;
    foreach ($strings as $candidate) {
        if ($candidate === null) {
            $last = null;
            continue;
        }
        $last = (string) $candidate;
        if ($last !== '') {
            return $last;
        }
    }

    return $last;
}
62:
/**
 * Replace ASCII uppercase letters in a string with their lowercase
 * equivalents
 *
 * Every other byte is returned unchanged.
 */
public static function lower(string $string): string
{
    $from = self::UPPER;
    $to = self::LOWER;

    return strtr($string, $from, $to);
}
70:
/**
 * Replace ASCII lowercase letters in a string with their uppercase
 * equivalents
 *
 * Every other byte is returned unchanged.
 */
public static function upper(string $string): string
{
    $from = self::LOWER;
    $to = self::UPPER;

    return strtr($string, $from, $to);
}
78:
/**
 * Convert the first byte of a string to uppercase if it is an ASCII
 * lowercase letter
 *
 * An empty string is returned as-is.
 */
public static function upperFirst(string $string): string
{
    if ($string === '') {
        return $string;
    }

    return self::upper($string[0]) . substr($string, 1);
}
89:
/**
 * Apply the case of one string to another
 *
 * After trimming whitespace from `$match`:
 *
 * - if it has no ASCII letters, `$string` is returned unchanged
 * - if its only letters are lowercase, `$string` is converted to lowercase
 * - if it is one uppercase letter, or it has letters of both cases and its
 *   first character is an uppercase letter, `$string` is converted to
 *   lowercase with an uppercase first character
 * - if its only letters are uppercase (and it is longer than one character),
 *   `$string` is converted to uppercase
 * - otherwise, `$string` is returned unchanged
 */
public static function matchCase(string $string, string $match): string
{
    $match = trim($match);
    if ($match === '') {
        return $string;
    }

    // strpbrk() returns the remainder of $match from its first uppercase
    // letter, so it equals $match if $match starts with an uppercase letter
    $fromFirstUpper = strpbrk($match, self::UPPER);
    $hasUpper = $fromFirstUpper !== false;
    $hasLower = strpbrk($match, self::LOWER) !== false;

    if (!$hasUpper) {
        return $hasLower
            ? self::lower($string)
            : $string;
    }

    // A lone uppercase letter can't be distinguished from the start of a
    // capitalised word, so it is treated as one
    if (strlen($match) === 1) {
        return self::upperFirst(self::lower($string));
    }

    if (!$hasLower) {
        return self::upper($string);
    }

    // Mixed case only applies if the first character is uppercase
    if ($fromFirstUpper !== $match) {
        return $string;
    }

    return self::upperFirst(self::lower($string));
}
129:
/**
 * Check if a string starts with any of the given substrings
 *
 * Empty substrings never match. If `$ignoreCase` is `true`, the string and
 * substrings are compared after converting them to lowercase.
 *
 * @param iterable<string>|string $needles
 */
public static function startsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $candidates = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $candidates = Arr::lower($candidates);
    }

    foreach ($candidates as $candidate) {
        if ($candidate === '') {
            continue;
        }
        $prefix = substr($haystack, 0, strlen($candidate));
        if ($prefix === $candidate) {
            return true;
        }
    }

    return false;
}
151:
/**
 * Check if a string ends with any of the given substrings
 *
 * Empty substrings never match. If `$ignoreCase` is `true`, the string and
 * substrings are compared after converting them to lowercase.
 *
 * @param iterable<string>|string $needles
 */
public static function endsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $candidates = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $candidates = Arr::lower($candidates);
    }

    foreach ($candidates as $candidate) {
        if ($candidate === '') {
            continue;
        }
        $suffix = substr($haystack, -strlen($candidate));
        if ($suffix === $candidate) {
            return true;
        }
    }

    return false;
}
173:
/**
 * Check if a string contains no bytes outside the 7-bit ASCII range (0 to
 * 127)
 *
 * An empty string is considered ASCII.
 */
public static function isAscii(string $string): bool
{
    // Length of the initial segment with no bytes between 0x80 and 0xff
    $asciiLength = strcspn($string, self::ASCII_EXTENDED);

    return $asciiLength === strlen($string);
}
181:
/**
 * Insert a backslash before characters in a string that would otherwise be
 * treated as Markdown syntax
 */
public static function escapeMarkdown(string $string): string
{
    // Alternatives, in order:
    // 1. "*", "<", "[", "\", "`" and "|" anywhere
    // 2. "_" after the start of a line, or after horizontal whitespace or
    //    punctuation
    // 3. "_" before horizontal whitespace, punctuation or the end of a line
    //    (ignoring any underscores in between)
    // 4. the first "~" of exactly two
    // 5. after optional whitespace at the start of a line: ">", the first
    //    "~" of three or more, or a heading ("#" x 1-6), list ("+", "-") or
    //    ordered list ("." after digits) marker before horizontal whitespace
    $pattern = <<<'REGEX'
        / [*<[\\`|] |
        (?<= [\h[:punct:]] (?: (?<! _ ) | (?<= \G ) ) | ^ ) _ |
        _ (?= _*+ (?: [\h[:punct:]] | $ | \R ) ) |
        (?<! ~ ) ~ (?= ~ (?! ~ ) ) |
        ^ \h* \K (?: > | ~ (?= ~~+ ) | (?: \# {1,6} | [+-] | [0-9]+ \K \. ) (?= \h ) ) /mx
        REGEX;

    // Replace each match with a backslash followed by the match itself
    $escaped = Regex::replace($pattern, '\\\\$0', $string);

    return $escaped;
}
199:
/**
 * Convert a string to a normalised form for comparison with other
 * normalised strings
 *
 * The return value of this method is not covered by the Salient toolkit's
 * backward compatibility promise.
 */
public static function normalise(string $string): string
{
    // Replacements are applied in order, each to the result of the last
    $steps = [
        // Replace "&" with " and " if it follows an alphanumeric character
        // and there is another one before the next "&"
        '/([[:alnum:]][^&]*+)&(?=[^&[:alnum:]]*+[[:alnum:]])/u' => '$1 and ',
        // Remove "."
        '/\.++/' => '',
        // Replace sequences of non-alphanumeric characters with " "
        '/[^[:alnum:]]++/u' => ' ',
    ];

    $replaced = Regex::replace(
        array_keys($steps),
        array_values($steps),
        $string,
    );

    // Remove leading and trailing whitespace before converting ASCII
    // letters to uppercase
    return self::upper(trim($replaced));
}
223:
/**
 * Truncate a string to a maximum number of characters, replacing the last
 * three with an ellipsis ("...") if it is too long
 *
 * Whitespace immediately before the ellipsis is removed. Strings that fit
 * within `$length` characters are returned unchanged.
 *
 * @param int<3,max> $length
 */
public static function ellipsize(string $value, int $length): string
{
    if (mb_strlen($value) <= $length) {
        return $value;
    }

    // Leave room for the three characters of the ellipsis
    $kept = mb_substr($value, 0, $length - 3);

    return rtrim($kept) . '...';
}
238:
/**
 * Replace every end-of-line sequence in a string ("\r\n", "\r" or "\n") with
 * a given sequence
 */
public static function setEol(string $string, string $eol = "\n"): string
{
    if ($eol === "\n") {
        return str_replace(["\r\n", "\r"], "\n", $string);
    }

    if ($eol === "\r") {
        return str_replace(["\r\n", "\n"], "\r", $string);
    }

    // For "\r\n" and any other sequence, convert to "\n" first so "\r\n" is
    // only replaced once
    $normalised = str_replace(["\r\n", "\r"], "\n", $string);

    return str_replace("\n", $eol, $normalised);
}
255:
/**
 * Remove every native end-of-line sequence (`PHP_EOL`) from the end of a
 * string
 */
public static function trimNativeEol(string $string): string
{
    $eol = \PHP_EOL;

    if ($eol !== "\n") {
        $width = strlen($eol);
        while (substr($string, -$width) === $eol) {
            $string = substr($string, 0, -$width);
        }

        return $string;
    }

    $trimmed = rtrim($string, "\n");
    $removed = $trimmed !== $string;

    // If "\n" was removed after "\r", put one back so a trailing "\r\n" is
    // not split
    if ($removed && $trimmed !== '' && $trimmed[-1] === "\r") {
        return $trimmed . "\n";
    }

    return $trimmed;
}
277:
/**
 * Convert line feed (LF) characters in a string to the native end-of-line
 * sequence (`PHP_EOL`)
 *
 * The string is returned unchanged if the native sequence is a line feed.
 */
public static function eolToNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace("\n", \PHP_EOL, $string);
}
288:
/**
 * Convert native end-of-line sequences (`PHP_EOL`) in a string to line feed
 * (LF) characters
 *
 * The string is returned unchanged if the native sequence is a line feed.
 */
public static function eolFromNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace(\PHP_EOL, "\n", $string);
}
299:
/**
 * Get the words in a string as lowercase snake_case, optionally preserving
 * non-word characters
 *
 * Words are identified by words(), delimited with "_" and converted to
 * lowercase.
 */
public static function snake(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '_', $preserve);

    return self::lower($delimited);
}
308:
/**
 * Get the words in a string as lowercase kebab-case, optionally preserving
 * non-word characters
 *
 * Words are identified by words(), delimited with "-" and converted to
 * lowercase.
 */
public static function kebab(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '-', $preserve);

    return self::lower($delimited);
}
317:
/**
 * Get the words in a string as camelCase, optionally preserving non-word
 * characters
 *
 * The string is converted to PascalCase, then every letter that does not
 * follow a letter or digit is converted to lowercase.
 */
public static function camel(string $string, string $preserve = ''): string
{
    $pascal = self::pascal($string, $preserve);

    $lowerMatch = static function ($matches) {
        return self::lower($matches[0]);
    };

    return Regex::replaceCallback(
        '/(?<![[:alnum:]])[[:alpha:]]/u',
        $lowerMatch,
        $pascal,
    );
}
330:
/**
 * Get the words in a string as PascalCase, optionally preserving non-word
 * characters
 *
 * Words are identified by words(), converted to lowercase with an uppercase
 * first letter, and joined without a separator.
 */
public static function pascal(string $string, string $preserve = ''): string
{
    $capitalise = static function ($word) {
        return self::upperFirst(self::lower($word));
    };

    return self::words($string, '', $preserve, $capitalise);
}
339:
/**
 * Delimit the words in a string with a separator, optionally preserving
 * non-word characters and applying a callback to each word
 *
 * A word consists of one or more letters of the same case, or one uppercase
 * letter followed by zero or more lowercase letters. Numbers are treated as
 * lowercase letters except that two or more uppercase letters form one word
 * with any subsequent numbers.
 *
 * @param string $separator Cannot contain alphanumeric or preserved
 * characters.
 * @param string $preserve Characters to keep in addition to letters and
 * digits. Alphanumeric characters are ignored.
 * @param (Closure(string): string)|null $callback
 * @throws InvalidArgumentException if `$separator` contains a character that
 * would be preserved.
 */
public static function words(
    string $string,
    string $separator = ' ',
    string $preserve = '',
    ?Closure $callback = null
): string {
    // Letters and digits are always kept, so only other characters in
    // $preserve need to be added to the character class
    $extra = $preserve === ''
        ? ''
        : Regex::replace('/[[:alnum:]]++/u', '', $preserve);

    $keep = '[:alnum:]';
    $before = '';
    if ($extra !== '') {
        $keep .= Regex::quoteCharacters($extra, '/');
        // Prevent "key=value" becoming "key= value" when preserving "=" by
        // asserting that when separating words, they must appear:
        // - immediately after the previous word (\G),
        // - after an unpreserved character, or
        // - at a word boundary (e.g. "Value" in "key=someValue")
        if ($separator !== '') {
            $before = '(?:\G'
                . '|(?<=[^' . $keep . '])'
                . '|(?<=[[:lower:][:digit:]])(?=[[:upper:]]))';
        }
    }

    $word = '(?:[[:upper:]]?[[:lower:][:digit:]]++'
        . '|(?:[[:upper:]](?![[:lower:]]))++[[:digit:]]*+)';

    // Insert separators before words to prevent "foo bar" becoming "foobar"
    $replacement = '';
    if ($separator !== '') {
        if (Regex::match('/[' . $keep . ']/u', $separator)) {
            throw new InvalidArgumentException('Invalid separator (preserved characters cannot be used)');
        }
        $replacement = Regex::quoteReplacement($separator);
        $string = Regex::replace(
            '/' . $before . $word . '/u',
            $replacement . '$0',
            $string,
        );
    }

    if ($callback !== null) {
        $applyCallback = function ($matches) use ($callback) {
            return $callback($matches[0]);
        };
        $string = Regex::replaceCallback(
            '/' . $word . '/u',
            $applyCallback,
            $string,
        );
    }

    // Trim unpreserved characters from the beginning and end of the string,
    // then replace sequences of them with one separator
    $unpreserved = '[^' . $keep . ']++';

    return Regex::replace(
        [
            '/^' . $unpreserved . '|' . $unpreserved . '$/uD',
            '/' . $unpreserved . '/u',
        ],
        [
            '',
            $replacement,
        ],
        $string,
    );
}
411:
/**
 * Replace tabs in a string with spaces, aligning text to the next tab stop
 *
 * Columns are counted in characters and reset to 1 after each end-of-line
 * sequence ("\r\n", "\n" or "\r"), which is preserved.
 *
 * @param int<1,max> $tabSize
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandTabs(
    string $string,
    int $tabSize = 8,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }

    // Split into alternating lines and end-of-line sequences, then add an
    // empty sequence so the last line can be handled like the others
    $segments = Regex::split('/(\r\n|\n|\r)/', $string, -1, \PREG_SPLIT_DELIM_CAPTURE);
    $segments[] = '';

    $result = '';
    foreach (array_chunk($segments, 2) as [$line, $eol]) {
        $cells = explode("\t", $line);
        // Text after the last tab is not followed by padding
        $tail = array_pop($cells);
        foreach ($cells as $cell) {
            $result .= $cell;
            $column += mb_strlen($cell);
            // e.g. with $tabSize 4, a tab at $column 2 occupies 3 spaces
            $width = $tabSize - (($column - 1) % $tabSize);
            $result .= str_repeat(' ', $width);
            $column += $width;
        }
        $result .= $tail . $eol;
        $column = 1;
    }

    return $result;
}
448:
/**
 * Replace tabs in the leading whitespace of each line in a string with
 * spaces, aligning text to the next tab stop
 *
 * Tabs that appear after the first character in a line that is not a space
 * or tab are not expanded.
 *
 * @param int<1,max> $tabSize
 * @param bool $preserveLine1 If `true`, tabs in the first line of `$string`
 * are not expanded.
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandLeadingTabs(
    string $string,
    int $tabSize = 8,
    bool $preserveLine1 = false,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }

    // Split into alternating lines and end-of-line sequences, then add an
    // empty sequence so the last line can be handled like the others
    $segments = Regex::split('/(\r\n|\n|\r)/', $string, -1, \PREG_SPLIT_DELIM_CAPTURE);
    $segments[] = '';

    $result = '';
    foreach (array_chunk($segments, 2) as $index => [$line, $eol]) {
        if (!$index && $preserveLine1) {
            $result .= $line . $eol;
            $column = 1;
            continue;
        }

        $cells = explode("\t", $line);
        $last = count($cells) - 1;
        for ($i = 0; $i <= $last; $i++) {
            $cell = $cells[$i];
            $result .= $cell;
            if ($i === $last) {
                break;
            }
            // Leading whitespace ends at the first cell with anything other
            // than spaces, so the rest of the line is copied as-is
            if (trim($cell, ' ') !== '') {
                $result .= "\t" . implode("\t", array_slice($cells, $i + 1));
                break;
            }
            $column += mb_strlen($cell);
            $width = $tabSize - (($column - 1) % $tabSize);
            $result .= str_repeat(' ', $width);
            $column += $width;
        }
        $result .= $eol;
        $column = 1;
    }

    return $result;
}
496:
/**
 * Decode a base32-encoded string
 *
 * Trailing padding characters ('=') are removed and ASCII letters are
 * converted to uppercase before decoding. Bits left over after the last
 * complete byte are discarded.
 *
 * @param bool $strict If `true`, throw an exception if any characters in
 * `$string` are not in the \[RFC4648] "base32" alphabet after removing
 * padding characters ('=') and converting ASCII letters to uppercase:
 *
 * ```
 * ABCDEFGHIJKLMNOPQRSTUVWXYZ234567
 * ```
 *
 * Otherwise, discard invalid characters.
 * @throws InvalidArgumentException if `$strict` is `true` and `$string`
 * contains a character that is not in the alphabet.
 */
public static function decodeBase32(string $string, bool $strict = false): string
{
    $string = self::upper(rtrim($string, '='));

    $decoded = '';
    // Bits received but not yet written to $decoded (always fewer than 8
    // between iterations)
    $buffer = 0;
    $bufferBits = 0;
    $length = strlen($string);
    for ($i = 0; $i < $length; $i++) {
        $character = $string[$i];
        $value = self::BASE32_INDEX[$character] ?? null;
        if ($value === null) {
            if ($strict) {
                throw new InvalidArgumentException(
                    sprintf('Character not in base32 alphabet: %s', $character),
                );
            }
            continue;
        }

        // Each character contributes 5 bits
        $buffer = ($buffer << 5) | $value;
        $bufferBits += 5;
        if ($bufferBits < 8) {
            continue;
        }

        // Emit the 8 most significant bits and keep the remainder
        $bufferBits -= 8;
        $decoded .= chr($buffer >> $bufferBits);
        $buffer &= (1 << $bufferBits) - 1;
    }

    return $decoded;
}
550:
/**
 * Get a php://temp stream with a copy of a string
 *
 * The string is written to a stream opened for reading and writing, and the
 * stream is rewound before it is returned.
 *
 * @return resource
 */
public static function toStream(string $string)
{
    $handle = File::open('php://temp', 'r+');

    File::writeAll($handle, $string);
    File::rewind($handle);

    return $handle;
}
563:
/**
 * Split a string on a separator, then trim each substring and optionally
 * remove empty strings
 *
 * @param non-empty-string $separator
 * @param int|null $limit The maximum number of substrings to return.
 * Implies `$removeEmpty = false` if not `null`.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @return ($limit is null ? ($removeEmpty is true ? list<string> : non-empty-list<string>) : non-empty-list<string>)
 */
public static function split(
    string $separator,
    string $string,
    ?int $limit = null,
    bool $removeEmpty = true,
    ?string $characters = null
): array {
    // Empty strings are only removed when there is no limit
    $discardEmpty = $removeEmpty && $limit === null;

    $parts = explode($separator, $string, $limit ?? \PHP_INT_MAX);
    $trimmed = Arr::trim($parts, $characters, $discardEmpty);

    if ($discardEmpty) {
        return $trimmed;
    }

    return array_values($trimmed);
}
588:
/**
 * Split a string on a single-character separator without splitting
 * bracket-delimited or quoted substrings, then trim each substring and
 * optionally remove empty strings
 *
 * Separators between matching `()`, `<>`, `[]` or `{}` are ignored, as are
 * separators in substrings with the quotes selected via `$flags`.
 *
 * @param non-empty-string $separator
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @param int-mask-of<Str::PRESERVE_*> $flags
 * @return ($removeEmpty is true ? list<string> : non-empty-list<string>)
 * @throws InvalidArgumentException if `$separator` is not exactly one byte,
 * or is a bracket or a quote that is being preserved.
 */
public static function splitDelimited(
    string $separator,
    string $string,
    bool $removeEmpty = true,
    ?string $characters = null,
    int $flags = Str::PRESERVE_DOUBLE_QUOTED
): array {
    if (strlen($separator) !== 1) {
        throw new InvalidArgumentException('Separator must be a single character');
    }

    // Collect the quote characters to preserve and a regex alternative that
    // matches a substring enclosed by each, allowing backslash escapes
    $quoteChars = '';
    $quotedAlternatives = '';
    if ($flags & self::PRESERVE_DOUBLE_QUOTED) {
        $quoteChars .= '"';
        $quotedAlternatives .= ' | " (?: [^"\\\\] | \\\\ . )*+ "';
    }
    if ($flags & self::PRESERVE_SINGLE_QUOTED) {
        $quoteChars .= "'";
        $quotedAlternatives .= " | ' (?: [^'\\\\] | \\\\ . )*+ '";
    }

    $delimiters = '()<>[]{}' . $quoteChars;
    if (strpos($delimiters, $separator) !== false) {
        throw new InvalidArgumentException('Separator cannot be a delimiter');
    }

    // The separator, quoted for use outside and inside a character class
    $sep = Regex::quote($separator, '/');
    $sepInClass = Regex::quoteCharacters($separator, '/');

    $pattern = <<<REGEX
        (?x)
        (?: [^{$quoteChars}()<>[\]{}{$sepInClass}]++ |
        ( \( (?: [^{$quoteChars}()<>[\]{}]*+ (?-1)? )*+ \) |
        < (?: [^{$quoteChars}()<>[\]{}]*+ (?-1)? )*+ > |
        \[ (?: [^{$quoteChars}()<>[\]{}]*+ (?-1)? )*+ \] |
        \{ (?: [^{$quoteChars}()<>[\]{}]*+ (?-1)? )*+ \}{$quotedAlternatives} ) |
        # Match empty substrings
        (?<= {$sep} | ^ ) (?= {$sep} | \$ ) )+
        REGEX;
    $pattern = Regex::delimit($pattern, '/');

    Regex::matchAll($pattern, $string, $matches);
    $parts = Arr::trim($matches[0], $characters, $removeEmpty);

    if ($removeEmpty) {
        // @phpstan-ignore return.type
        return $parts;
    }

    // @phpstan-ignore return.type
    return array_values($parts);
}
644:
/**
 * Wrap a string to a given number of characters, optionally varying the
 * width of the first line
 *
 * Wrapping is performed by `wordwrap()`, so widths are measured in bytes.
 *
 * @param int|array{int,int} $width The number of characters at which the
 * string will be wrapped, or `[ <first_line_width>, <width> ]`.
 */
public static function wrap(
    string $string,
    $width = 75,
    string $break = "\n",
    bool $cutLongWords = false
): string {
    if (is_array($width)) {
        $firstWidth = $width[0];
        $width = $width[1];
    } else {
        $firstWidth = $width;
    }
    // Positive if the first line is narrower than the rest, negative if it
    // is wider
    $delta = $width - $firstWidth;

    if (!$delta) {
        return wordwrap($string, $width, $break, $cutLongWords);
    }

    if ($delta < 0) {
        // For hanging indents, set aside the extra characters allowed on
        // the first line, wrap the rest, and put them back.
        //
        // Before PHP 8.0, substr() returns false instead of an empty string
        // if the string is shorter than the offset, which wordwrap() rejects
        // under strict_types, hence the casts.
        $extra = -$delta;
        $head = (string) substr($string, 0, $extra);
        $tail = (string) substr($string, $extra);

        return $head . wordwrap($tail, $width, $break, $cutLongWords);
    }

    // For first line indents, add placeholder characters to the start of
    // the string, wrap it, and remove them
    $padded = str_repeat('x', $delta) . $string;
    $wrapped = wordwrap($padded, $width, $break, $cutLongWords);

    return (string) substr($wrapped, $delta);
}
674:
/**
 * Undo wordwrap(), preserving Markdown-style paragraphs and lists
 *
 * Non-consecutive line breaks are converted to spaces except before:
 *
 * - four or more spaces
 * - one or more tabs
 * - Markdown-style list items (e.g. `- item`, `1. item`)
 *
 * Line breaks at the start and end of the string are also preserved.
 *
 * @param string $break The line break to look for.
 * @param bool $ignoreEscapes If `false`, preserve escaped whitespace.
 * @param bool $trimLines If `true`, remove whitespace from the end of each
 * line and between unwrapped lines.
 * @param bool $collapseBlankLines If `true`, collapse three or more
 * subsequent line breaks to two.
 */
public static function unwrap(
    string $string,
    string $break = "\n",
    bool $ignoreEscapes = true,
    bool $trimLines = false,
    bool $collapseBlankLines = false
): string {
    $newline = Regex::quote($break, '/');
    // If escapes are honoured, only match after an even number of
    // backslashes (including none)
    $noEscape = $ignoreEscapes ? '' : '(?<!\\\\)(?:\\\\\\\\)*\K';

    $search = [];
    $replace = [];
    $between = '';

    if ($trimLines) {
        $search[] = "/{$noEscape}\h+({$newline})/";
        $replace[] = '$1';
        $between = '\h*';
    }

    // Match a line break that is not adjacent to another line break or to
    // the start or end of the string, and is not followed by four spaces, a
    // tab, or a list item marker
    $search[] = '/' . $noEscape
        . '(?<!' . $newline . '|^)' . $newline
        . '(?!' . $newline . '|$| {4}|\t|(?:[-+*]|[0-9]+[).])\h)'
        . $between . '/D';
    $replace[] = ' ';

    if ($collapseBlankLines) {
        $search[] = "/(?:{$newline}){3,}/";
        $replace[] = $break . $break;
    }

    return Regex::replace($search, $replace, $string);
}
718:
/**
 * Replace each sequence of one or more whitespace characters in a string
 * with one space
 */
public static function collapse(string $string): string
{
    $collapsed = Regex::replace('/\s++/', ' ', $string);

    return $collapsed;
}
726:
/**
 * Add delimiters to the start and end of a string
 *
 * @param string|null $after If `null`, `$before` is used before and after
 * the string.
 */
public static function enclose(string $string, string $before, ?string $after = null): string
{
    if ($after === null) {
        $after = $before;
    }

    return "{$before}{$string}{$after}";
}
737:
/**
 * Get the Levenshtein distance between two strings as a fraction of the
 * length of the longest string
 *
 * @param bool $normalise If `true`, pass both strings to normalise() before
 * comparing them.
 * @return float A value between `0` and `1`, where `0` means the strings
 * are identical, and `1` means they have no similarities.
 */
public static function distance(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    // Two empty strings are identical, and dividing by their length would
    // fail
    $longest = max(strlen($string1), strlen($string2));
    if ($longest === 0) {
        return 0.0;
    }

    $edits = levenshtein($string1, $string2);

    return $edits / $longest;
}
762:
/**
 * Get the number of matching characters in two strings as a fraction of the
 * length of the longest string
 *
 * `similar_text()` is called with the strings in both orders and the higher
 * result is used.
 *
 * @param bool $normalise If `true`, pass both strings to normalise() before
 * comparing them.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no similarities, and `1` means they are identical.
 */
public static function similarity(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    // Two empty strings are identical, and dividing by their length would
    // fail
    $longest = max(strlen($string1), strlen($string2));
    if ($longest === 0) {
        return 1.0;
    }

    $forward = similar_text($string1, $string2);
    $reverse = similar_text($string2, $string1);

    return max($forward, $reverse) / $longest;
}
789:
/**
 * Get the number of ngrams shared between two strings as a fraction of the
 * number of ngrams in the longest string
 *
 * @param bool $normalise If `true`, pass both strings to normalise() before
 * comparing them.
 * @param int $size The number of bytes in each ngram.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramSimilarity(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    $relativeToLongest = true;

    return self::ngramScore(
        $relativeToLongest,
        $string1,
        $string2,
        $normalise,
        $size,
    );
}
805:
/**
 * Get the number of ngrams shared between two strings as a fraction of the
 * number of ngrams in the shortest string
 *
 * @param bool $normalise If `true`, pass both strings to normalise() before
 * comparing them.
 * @param int $size The number of bytes in each ngram.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means every ngram in the shortest string is
 * shared with the other.
 */
public static function ngramIntersection(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    $relativeToLongest = false;

    return self::ngramScore(
        $relativeToLongest,
        $string1,
        $string2,
        $normalise,
        $size,
    );
}
821:
/**
 * Get the number of ngrams shared between two strings as a fraction of the
 * number of ngrams in the longest or shortest string
 *
 * Returns `1.0` if both strings are too short to have any ngrams, and `0.0`
 * if only one of them is, whichever string the result is relative to.
 *
 * @param bool $relativeToLongest If `true`, divide by the number of ngrams
 * in the string with the most ngrams, otherwise divide by the number in the
 * string with the fewest.
 * @param bool $normalise If `true`, pass both strings to normalise() before
 * comparing them.
 */
private static function ngramScore(
    bool $relativeToLongest,
    string $string1,
    string $string2,
    bool $normalise,
    int $size
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    if (strlen($string1) < $size && strlen($string2) < $size) {
        return 1.0;
    }

    $ngrams1 = self::ngrams($string1, $size);
    $ngrams2 = self::ngrams($string2, $size);
    $count = $relativeToLongest
        ? max(count($ngrams1), count($ngrams2))
        : min(count($ngrams1), count($ngrams2));

    // If one string has no ngrams, nothing can be shared, and dividing by
    // the ngram count of the shortest string would be division by zero
    if ($count === 0) {
        return 0.0;
    }

    // Count shared ngrams as a multiset intersection: each ngram in the
    // second string can only be matched once
    $available = array_count_values($ngrams2);
    $same = 0;
    foreach ($ngrams1 as $ngram) {
        if (($available[$ngram] ?? 0) > 0) {
            $available[$ngram]--;
            $same++;
        }
    }

    return $same / $count;
}
855:
/**
 * Get every substring of a given length from a string
 *
 * Ngrams are grouped by their offset modulo `$size`, e.g. the 2-grams of
 * "abcde" are returned as `["ab", "cd", "bc", "de"]`. An empty array is
 * returned if the string is shorter than `$size` bytes.
 *
 * @return string[]
 */
public static function ngrams(string $string, int $size = 2): array
{
    $length = strlen($string);
    if ($length < $size) {
        return [];
    }

    $ngrams = [];
    for ($offset = 0; $offset < $size; $offset++) {
        // Collect consecutive, non-overlapping ngrams from this offset,
        // ignoring any incomplete ngram at the end of the string
        for ($start = $offset; $start + $size <= $length; $start += $size) {
            $ngrams[] = substr($string, $start, $size);
        }
    }

    return $ngrams;
}
886:
/**
 * Group lists in a string by heading and remove duplicate items
 *
 * - Lines in `$string` are processed in order, from first to last
 * - If a non-empty line matches `$itemRegex`, it is treated as a list item,
 *   otherwise it becomes the current heading
 * - The current heading is cleared when an empty line is encountered after
 *   a list item (unless `$loose` is `true`)
 * - Top-level lines (headings with no items, and items with no heading) are
 *   returned before lists with headings
 * - If `$itemRegex` has a named subpattern called `indent` that matches a
 *   non-empty string, subsequent lines with indentation of the same width
 *   are treated as a continuation of the item, along with any empty lines
 *   between them
 *
 * The work is performed by an instance of `ListMerger`.
 *
 * @param string $listSeparator Inserted between headings and lists.
 * @param string|null $headingPrefix Inserted before headings, e.g. `"-"`.
 * Indentation of the same width is applied to subsequent list items. An
 * empty string is equivalent to `null`.
 * @param string|null $itemRegex If `null`, {@see Str::DEFAULT_ITEM_REGEX}
 * is used.
 * @param bool $clean If `true`, remove the first match of `$itemRegex` from
 * the beginning of each item with no heading.
 * @param bool $loose If `true`, do not clear the current heading when an
 * empty line is encountered.
 * @param bool $discardEmpty If `true`, discard headings with no items.
 * @param int<1,max> $tabSize
 */
public static function mergeLists(
    string $string,
    string $listSeparator = "\n",
    ?string $headingPrefix = null,
    ?string $itemRegex = Str::DEFAULT_ITEM_REGEX,
    bool $clean = false,
    bool $loose = false,
    bool $discardEmpty = false,
    string $eol = "\n",
    int $tabSize = 4
): string {
    // Replace an empty prefix with null
    $prefix = self::coalesce($headingPrefix, null);
    $regex = $itemRegex ?? self::DEFAULT_ITEM_REGEX;

    $merger = new ListMerger(
        $listSeparator,
        $prefix,
        $regex,
        $clean,
        $loose,
        $discardEmpty,
        $eol,
        $tabSize,
    );

    return $merger->merge($string);
}
934: }
935: