File Toolkit/Utility/Str.php | salient/toolkit API

1:	<?php declare(strict_types=1);
2:
3:	namespace Salient\Utility;
4:
5:	use Salient\Utility\Internal\ListMerger;
6:	use Closure;
7:	use InvalidArgumentException;
8:	use Stringable;
9:
10:	/**
11:	* Work with strings
12:	*
13:	* @api
14:	*/
15:	final class Str extends AbstractUtility
16:	{
/** ASCII letters followed by decimal digits */
public const ALPHANUMERIC = Str::ALPHA . Str::NUMERIC;
/** Lowercase ASCII letters followed by uppercase ASCII letters */
public const ALPHA = Str::LOWER . Str::UPPER;
/** Lowercase ASCII letters, in the same order as UPPER so the two can be paired with strtr() */
public const LOWER = 'abcdefghijklmnopqrstuvwxyz';
/** Uppercase ASCII letters, in the same order as LOWER */
public const UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
/** Decimal digits */
public const NUMERIC = '0123456789';
/** Hexadecimal digits in both cases */
public const HEX = '0123456789abcdefABCDEF';
/** splitDelimited() flag: do not split double-quoted substrings */
public const PRESERVE_DOUBLE_QUOTED = 1;
/** splitDelimited() flag: do not split single-quoted substrings */
public const PRESERVE_SINGLE_QUOTED = 2;
/** splitDelimited() flag: do not split double- or single-quoted substrings */
public const PRESERVE_QUOTED = Str::PRESERVE_DOUBLE_QUOTED | Str::PRESERVE_SINGLE_QUOTED;

/** Every byte from 0x80 to 0xFF, i.e. every byte value outside 7-bit ASCII */
public const ASCII_EXTENDED =
    "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
    . "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
    . "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
    . "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
    . "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
    . "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
    . "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
    . "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
36:
/**
 * Default value of mergeLists() parameter $itemRegex
 *
 * Matches optional horizontal whitespace followed by a "-" or "*" bullet
 * and one space at the start of a line. The whole match is captured as
 * `indent`.
 */
public const DEFAULT_ITEM_REGEX = '/^(?<indent>\h*[-*] )/';
41:
/**
 * Get the first value that is neither null nor an empty string after being
 * cast to string
 *
 * If no value qualifies, the last value is returned: `null` if it is null,
 * otherwise its string representation (an empty string). `null` is also
 * returned when no values are given.
 *
 * @param int|float|string|bool|Stringable|null ...$strings
 */
public static function coalesce(...$strings): ?string
{
    $last = null;
    foreach ($strings as $value) {
        if ($value === null) {
            $last = null;
            continue;
        }
        $last = (string) $value;
        if ($last !== '') {
            return $last;
        }
    }
    return $last;
}
60:
/**
 * Replace uppercase ASCII letters in a string with their lowercase
 * equivalents, leaving every other byte untouched
 */
public static function lower(string $string): string
{
    $from = self::UPPER;
    $to = self::LOWER;

    return strtr($string, $from, $to);
}
68:
/**
 * Replace lowercase ASCII letters in a string with their uppercase
 * equivalents, leaving every other byte untouched
 */
public static function upper(string $string): string
{
    $from = self::LOWER;
    $to = self::UPPER;

    return strtr($string, $from, $to);
}
76:
/**
 * Convert the first byte of a string to uppercase if it is an ASCII letter
 *
 * An empty string is returned unchanged.
 */
public static function upperFirst(string $string): string
{
    if ($string === '') {
        return $string;
    }

    $first = $string[0];
    $string[0] = self::upper($first);

    return $string;
}
87:
/**
 * Apply the case of one string to another
 *
 * `$match` is trimmed before it is inspected, and only ASCII letters are
 * considered:
 *
 * - no letters, or empty: `$string` is returned unchanged
 * - one character: lowercase gives lowercase, uppercase gives "Upper first"
 * - uppercase only: uppercase
 * - lowercase only: lowercase
 * - mixed case starting with an uppercase letter: "Upper first"
 * - any other mix: `$string` is returned unchanged
 */
public static function matchCase(string $string, string $match): string
{
    $match = trim($match);
    if ($match === '') {
        return $string;
    }

    // strpbrk() returns the remainder of $match from its first uppercase
    // letter, so it equals $match only if $match starts with one
    $fromFirstUpper = strpbrk($match, self::UPPER);
    $hasUpper = $fromFirstUpper !== false;
    $hasLower = strpbrk($match, self::LOWER) !== false;

    if (strlen($match) === 1) {
        if ($hasLower) {
            return self::lower($string);
        }
        if ($hasUpper) {
            return self::upperFirst(self::lower($string));
        }
        return $string;
    }

    // Exactly one case is present
    if ($hasUpper !== $hasLower) {
        return $hasUpper
            ? self::upper($string)
            : self::lower($string);
    }

    // Either both cases are present or neither is
    if (!$hasUpper || $fromFirstUpper !== $match) {
        return $string;
    }

    return self::upperFirst(self::lower($string));
}
127:
/**
 * Check if a string begins with any of the given substrings
 *
 * Empty needles never match. If `$ignoreCase` is `true`, the haystack and
 * needles are lowercased before they are compared.
 *
 * @param iterable<string>|string $needles
 */
public static function startsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $needles = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $needles = Arr::lower($needles);
    }

    foreach ($needles as $needle) {
        if ($needle === '') {
            continue;
        }
        $prefix = substr($haystack, 0, strlen($needle));
        if ($prefix === $needle) {
            return true;
        }
    }

    return false;
}
149:
/**
 * Check if a string ends with any of the given substrings
 *
 * Empty needles never match. If `$ignoreCase` is `true`, the haystack and
 * needles are lowercased before they are compared.
 *
 * @param iterable<string>|string $needles
 */
public static function endsWith(string $haystack, $needles, bool $ignoreCase = false): bool
{
    $needles = is_iterable($needles) ? $needles : [$needles];

    if ($ignoreCase) {
        $haystack = self::lower($haystack);
        $needles = Arr::lower($needles);
    }

    foreach ($needles as $needle) {
        if ($needle === '') {
            continue;
        }
        $suffix = substr($haystack, -strlen($needle));
        if ($suffix === $needle) {
            return true;
        }
    }

    return false;
}
171:
/**
 * Check if a string consists entirely of bytes between 0 and 127
 *
 * An empty string is considered ASCII.
 */
public static function isAscii(string $string): bool
{
    // Length of the leading run of bytes that are not in 0x80-0xFF
    $asciiPrefix = strcspn($string, self::ASCII_EXTENDED);

    return strlen($string) === $asciiPrefix;
}
179:
/**
 * Escape special characters in a string for use in Markdown
 *
 * A backslash is inserted before each match of a multiline, extended-mode
 * pattern with the following alternatives:
 *
 * 1. any of `*`, `<`, `[`, backslash, backtick or `|`
 * 2. an underscore at the start of a line, or after horizontal whitespace
 *    or punctuation (subject to the pattern's lookbehind conditions)
 * 3. an underscore followed, after any further underscores, by horizontal
 *    whitespace, punctuation, a line break or the end of a line
 * 4. the first `~` of a pair that is not part of a longer run of `~`
 * 5. after optional horizontal whitespace at the start of a line: `>`, a
 *    `~` that is followed by two or more `~`, or, when followed by
 *    horizontal whitespace, one to six `#`, a `+` or `-`, or the `.`
 *    after one or more digits
 */
public static function escapeMarkdown(string $string): string
{
    return Regex::replace(
        <<<'REGEX'
        / [*<[\\`|] |
        (?<= [\h[:punct:]] (?: (?<! _ ) | (?<= \G ) ) | ^ ) _ |
        _ (?= _*+ (?: [\h[:punct:]] | $ | \R ) ) |
        (?<! ~ ) ~ (?= ~ (?! ~ ) ) |
        ^ \h* \K (?: > | ~ (?= ~~+ ) | (?: \# {1,6} | [+-] | [0-9]+ \K \. ) (?= \h ) ) /mx
        REGEX,
        // Replacement: one literal backslash followed by the matched text
        '\\\\$0',
        $string,
    );
}
197:
/**
 * Normalise a string for comparison
 *
 * The return value of this method is not covered by the Salient toolkit's
 * backward compatibility promise.
 *
 * The string is transformed as follows:
 *
 * 1. A lone "&" that appears after an alphanumeric character and before
 *    another one (with no other "&" in between) is replaced with " and ",
 *    including when it is directly adjacent to them, e.g. "A&B"
 * 2. "." is removed
 * 3. Sequences of non-alphanumeric characters are replaced with " "
 * 4. Leading and trailing whitespace is removed
 * 5. ASCII letters are converted to uppercase
 */
public static function normalise(string $string): string
{
    $patterns = [
        // Zero-or-more quantifiers are required here so "&" is replaced
        // when it is immediately next to alphanumeric characters
        '/([[:alnum:]][^&]*+)&(?=[^&[:alnum:]]*+[[:alnum:]])/u',
        '/\.++/',
        '/[^[:alnum:]]++/u',
    ];
    $replacements = [
        '$1 and ',
        '',
        ' ',
    ];

    return self::upper(trim(Regex::replace($patterns, $replacements, $string)));
}
221:
/**
 * Replace the end of a string with an ellipsis ("...") if its length
 * exceeds a limit
 *
 * Length is measured in characters (via `mb_strlen()`), and trailing
 * whitespace is removed from the truncated text before the ellipsis is
 * added.
 *
 * @param int<3,max> $length The maximum length of the returned string,
 * including the ellipsis. Values less than `3` are treated as `3` because
 * the ellipsis itself is three characters long.
 */
public static function ellipsize(string $value, int $length): string
{
    // A smaller limit would pass a negative length to mb_substr(), which
    // would return a string longer than the input
    if ($length < 3) {
        $length = 3;
    }

    if (mb_strlen($value) > $length) {
        return rtrim(mb_substr($value, 0, $length - 3)) . '...';
    }

    return $value;
}
236:
/**
 * Replace every end-of-line sequence in a string ("\r\n", "\r" or "\n")
 * with the given sequence
 */
public static function setEol(string $string, string $eol = "\n"): string
{
    if ($eol === "\n") {
        return str_replace(["\r\n", "\r"], "\n", $string);
    }

    if ($eol === "\r") {
        return str_replace(["\r\n", "\n"], "\r", $string);
    }

    // For "\r\n" and any other sequence, normalise to "\n" first so that no
    // line break is replaced twice
    $normalised = str_replace(["\r\n", "\r"], "\n", $string);

    return str_replace("\n", $eol, $normalised);
}
253:
/**
 * Remove native end-of-line sequences from the end of a string
 *
 * When the native sequence is "\n", a final "\r\n" is left intact.
 */
public static function trimNativeEol(string $string): string
{
    if (\PHP_EOL !== "\n") {
        $eolLength = strlen(\PHP_EOL);
        while (substr($string, -$eolLength) === \PHP_EOL) {
            $string = substr($string, 0, -$eolLength);
        }
        return $string;
    }

    $trimmed = rtrim($string, "\n");
    $removedAny = $trimmed !== $string;
    $endsWithCr = $trimmed !== '' && $trimmed[-1] === "\r";

    // Put one "\n" back if removing it split a "\r\n" sequence
    return $removedAny && $endsWithCr
        ? $trimmed . "\n"
        : $trimmed;
}
275:
/**
 * Convert line feed (LF) characters in a string to the native end-of-line
 * sequence
 *
 * The string is returned unchanged when the native sequence is "\n".
 */
public static function eolToNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace("\n", \PHP_EOL, $string);
}
286:
/**
 * Convert native end-of-line sequences in a string to the line feed (LF)
 * character
 *
 * The string is returned unchanged when the native sequence is "\n".
 */
public static function eolFromNative(string $string): string
{
    if (\PHP_EOL === "\n") {
        return $string;
    }

    return str_replace(\PHP_EOL, "\n", $string);
}
297:
/**
 * Get the words in a string as lowercase snake_case, optionally preserving
 * non-word characters
 */
public static function snake(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '_', $preserve);

    return self::lower($delimited);
}
306:
/**
 * Get the words in a string as lowercase kebab-case, optionally preserving
 * non-word characters
 */
public static function kebab(string $string, string $preserve = ''): string
{
    $delimited = self::words($string, '-', $preserve);

    return self::lower($delimited);
}
315:
/**
 * Get the words in a string as camelCase, optionally preserving non-word
 * characters
 *
 * The string is converted to PascalCase, then every letter that does not
 * follow an alphanumeric character is lowercased.
 */
public static function camel(string $string, string $preserve = ''): string
{
    $pascal = self::pascal($string, $preserve);

    return Regex::replaceCallback(
        '/(?<![[:alnum:]])[[:alpha:]]/u',
        fn($letter) => self::lower($letter[0]),
        $pascal,
    );
}
328:
/**
 * Get the words in a string as PascalCase, optionally preserving non-word
 * characters
 *
 * Each word is lowercased and its first letter is then uppercased.
 */
public static function pascal(string $string, string $preserve = ''): string
{
    $capitalise = fn($word) => self::upperFirst(self::lower($word));

    return self::words($string, '', $preserve, $capitalise);
}
337:
/**
 * Get words from a string and delimit them with a separator, optionally
 * preserving non-word characters and applying a callback to each word
 *
 * A word consists of one or more letters of the same case, or one uppercase
 * letter followed by zero or more lowercase letters. Numbers are treated as
 * lowercase letters except that two or more uppercase letters form one word
 * with any subsequent numbers.
 *
 * @param string $separator Inserted between words. May be empty.
 * @param string $preserve Characters to keep in addition to alphanumeric
 * characters. Alphanumeric characters in this string are ignored.
 * @param (Closure(string): string)|null $callback Applied to each word
 * after separators are inserted.
 * @throws InvalidArgumentException if `$separator` contains an
 * alphanumeric or preserved character.
 */
public static function words(
    string $string,
    string $separator = ' ',
    string $preserve = '',
    ?Closure $callback = null
): string {
    $notAfterPreserve = '';
    if (
        $preserve !== ''
        && ($preserve = Regex::replace('/[[:alnum:]]++/u', '', $preserve)) !== ''
    ) {
        // From here on, $preserve is the interior of a character class that
        // matches every character to keep
        $preserve = Regex::quoteCharacters($preserve, '/');
        $preserve = "[:alnum:]{$preserve}";
        // Prevent "key=value" becoming "key= value" when preserving "=" by
        // asserting that when separating words, they must appear:
        // - immediately after the previous word (\G),
        // - after an unpreserved character, or
        // - at a word boundary (e.g. "Value" in "key=someValue")
        if ($separator !== '') {
            $notAfterPreserve = '(?:\G'
                . "|(?<=[^{$preserve}])"
                . '|(?<=[[:lower:][:digit:]])(?=[[:upper:]]))';
        }
    } else {
        $preserve = '[:alnum:]';
    }
    // Either an optional uppercase letter followed by lowercase letters
    // and digits, or a run of uppercase letters that are not followed by a
    // lowercase letter, with any digits after them
    $word = '(?:[[:upper:]]?[[:lower:][:digit:]]++'
        . '|(?:[[:upper:]](?![[:lower:]]))++[[:digit:]]*+)';

    // Insert separators before words to prevent "foo bar" becoming "foobar"
    if ($separator !== '') {
        if (Regex::match("/[{$preserve}]/u", $separator)) {
            throw new InvalidArgumentException('Invalid separator (preserved characters cannot be used)');
        }
        // $separator is used as replacement text from here on
        $separator = Regex::quoteReplacement($separator);
        $string = Regex::replace(
            "/$notAfterPreserve$word/u",
            $separator . '$0',
            $string,
        );
    }

    if ($callback !== null) {
        $string = Regex::replaceCallback(
            "/$word/u",
            fn($matches) => $callback($matches[0]),
            $string,
        );
    }

    // Trim unpreserved characters from the beginning and end of the string,
    // then replace sequences of them with one separator
    return Regex::replace([
        "/^[^{$preserve}]++|[^{$preserve}]++\$/uD",
        "/[^{$preserve}]++/u",
    ], [
        '',
        $separator,
    ], $string);
}
409:
/**
 * Expand tabs in a string to spaces
 *
 * Each tab is replaced with the number of spaces needed to reach the next
 * tab stop. Line breaks ("\r\n", "\n" and "\r") are preserved, and the
 * column is reset to 1 after each one.
 *
 * @param int<1,max> $tabSize
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandTabs(
    string $string,
    int $tabSize = 8,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }
    // Lines and their end-of-line sequences alternate in $lines
    $lines = Regex::split('/(\r\n|\n|\r)/', $string, -1, \PREG_SPLIT_DELIM_CAPTURE);
    // Give the last line an empty end-of-line sequence so every chunk
    // below is a [line, eol] pair
    $lines[] = '';
    $expanded = '';
    foreach (array_chunk($lines, 2) as [$line, $eol]) {
        $parts = explode("\t", $line);
        $last = array_key_last($parts);
        foreach ($parts as $i => $part) {
            $expanded .= $part;
            // No tab follows the last part of a line
            if ($i === $last) {
                $expanded .= $eol;
                break;
            }
            $column += mb_strlen($part);
            // e.g. with $tabSize 4, a tab at $column 2 occupies 3 spaces
            $spaces = $tabSize - (($column - 1) % $tabSize);
            $expanded .= str_repeat(' ', $spaces);
            $column += $spaces;
        }
        $column = 1;
    }
    return $expanded;
}
446:
/**
 * Expand leading tabs in a string to spaces
 *
 * On each line, tabs are expanded until text other than spaces is
 * encountered. Tabs after that point are left as they are. The column is
 * reset to 1 after each line.
 *
 * @param int<1,max> $tabSize
 * @param bool $preserveLine1 If `true`, tabs in the first line of `$string`
 * are not expanded.
 * @param int $column The starting column (1-based) of `$string`.
 */
public static function expandLeadingTabs(
    string $string,
    int $tabSize = 8,
    bool $preserveLine1 = false,
    int $column = 1
): string {
    if (strpos($string, "\t") === false) {
        return $string;
    }
    // Lines and their end-of-line sequences alternate in $lines
    $lines = Regex::split('/(\r\n|\n|\r)/', $string, -1, \PREG_SPLIT_DELIM_CAPTURE);
    // Give the last line an empty end-of-line sequence so every chunk
    // below is a [line, eol] pair
    $lines[] = '';
    $expanded = '';
    foreach (array_chunk($lines, 2) as $i => [$line, $eol]) {
        if (!$i && $preserveLine1) {
            $expanded .= $line . $eol;
            $column = 1;
            continue;
        }
        $parts = explode("\t", $line);
        do {
            $part = array_shift($parts);
            $expanded .= $part;
            // No tab follows the last part of a line
            if (!$parts) {
                $expanded .= $eol;
                break;
            }
            // The part contains something other than spaces, so the tabs
            // after it are not leading tabs: restore them verbatim
            if ($part !== '' && trim($part, ' ') !== '') {
                $expanded .= "\t" . implode("\t", $parts) . $eol;
                break;
            }
            $column += mb_strlen($part);
            // Number of spaces needed to reach the next tab stop
            $spaces = $tabSize - (($column - 1) % $tabSize);
            $expanded .= str_repeat(' ', $spaces);
            $column += $spaces;
        } while (true);
        $column = 1;
    }
    return $expanded;
}
494:
/**
 * Get a php://temp stream that contains a copy of a string
 *
 * The stream is opened in 'r+' mode and rewound after the string is
 * written to it.
 *
 * @return resource
 */
public static function toStream(string $string)
{
    $handle = File::open('php://temp', 'r+');

    File::writeAll($handle, $string);
    File::rewind($handle);

    return $handle;
}
507:
/**
 * Split a string on a separator, trim each substring and, optionally,
 * remove empty substrings
 *
 * @param non-empty-string $separator
 * @param int|null $limit The maximum number of substrings to return.
 * Implies `$removeEmpty = false` if not `null`.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @return ($limit is null ? ($removeEmpty is true ? list<string> : non-empty-list<string>) : non-empty-list<string>)
 */
public static function split(
    string $separator,
    string $string,
    ?int $limit = null,
    bool $removeEmpty = true,
    ?string $characters = null
): array {
    // Empty substrings are only removed when no limit applies
    $removeEmpty = $removeEmpty && $limit === null;

    $parts = Arr::trim(
        explode($separator, $string, $limit ?? \PHP_INT_MAX),
        $characters,
        $removeEmpty
    );

    if ($removeEmpty) {
        return $parts;
    }

    return array_values($parts);
}
532:
/**
 * Split a string by a single-character separator without splitting
 * bracket-delimited or quoted substrings, trim substrings and remove any
 * empty strings
 *
 * Separators between matching `()`, `<>`, `[]` or `{}` pairs, which may be
 * empty or nested, are ignored, as are separators in quoted substrings
 * selected by `$flags`.
 *
 * @param non-empty-string $separator A single character (byte) that is not
 * a bracket or a preserved quote character.
 * @param string|null $characters Characters to trim, `null` (the default)
 * to trim whitespace, or an empty string to trim nothing.
 * @param int-mask-of<Str::PRESERVE_*> $flags
 * @return ($removeEmpty is true ? list<string> : non-empty-list<string>)
 * @throws InvalidArgumentException if `$separator` is not exactly one byte
 * long or is one of the delimiters.
 */
public static function splitDelimited(
    string $separator,
    string $string,
    bool $removeEmpty = true,
    ?string $characters = null,
    int $flags = Str::PRESERVE_DOUBLE_QUOTED
): array {
    if (strlen($separator) !== 1) {
        throw new InvalidArgumentException('Separator must be a single character');
    }

    // $quotes is added to negated character classes below, and $regex
    // collects an extra alternative for each kind of quoted substring
    $quotes = '';
    $regex = '';
    if ($flags & self::PRESERVE_DOUBLE_QUOTED) {
        $quotes .= '"';
        $regex .= ' | " (?: [^"\\\\] | \\\\ . )*+ "';
    }
    if ($flags & self::PRESERVE_SINGLE_QUOTED) {
        $quotes .= "'";
        $regex .= " | ' (?: [^'\\\\] | \\\\ . )*+ '";
    }

    if (strpos('()<>[]{}' . $quotes, $separator) !== false) {
        throw new InvalidArgumentException('Separator cannot be a delimiter');
    }

    $quoted = Regex::quote($separator, '/');
    $escaped = Regex::quoteCharacters($separator, '/');
    // Each match is one substring: a run of plain text, bracketed groups
    // (matched recursively via `(?-1)`) and quoted substrings. Parentheses
    // are escaped because they are literal delimiters here, and `*+` is
    // used so empty and directly nested pairs like "()" and "((a))" match.
    $regex = <<<REGEX
        (?x)
        (?: [^{$quotes}()<>[\]{}{$escaped}]++ |
          ( \( (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \) |
            <  (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ >  |
            \[ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \] |
            \{ (?: [^{$quotes}()<>[\]{}]*+ (?-1)? )*+ \}{$regex} ) |
          # Match empty substrings
          (?<= $quoted | ^ ) (?= $quoted | \$ ) )+
        REGEX;
    $regex = Regex::delimit($regex, '/');
    Regex::matchAll($regex, $string, $matches);
    $split = Arr::trim($matches[0], $characters, $removeEmpty);

    // @phpstan-ignore return.type
    return $removeEmpty ? $split : array_values($split);
}
588:
/**
 * Wrap a string with wordwrap(), optionally using a different width for the
 * first line
 *
 * @param int|array{int,int} $width The number of characters at which the
 * string will be wrapped, or `[ <first_line_width>, <width> ]`.
 */
public static function wrap(
    string $string,
    $width = 75,
    string $break = "\n",
    bool $cutLongWords = false
): string {
    if (is_array($width)) {
        [$firstWidth, $otherWidth] = $width;
        $delta = $otherWidth - $firstWidth;
        $width = $otherWidth;
    } else {
        $delta = 0;
    }

    if (!$delta) {
        return wordwrap($string, $width, $break, $cutLongWords);
    }

    if ($delta < 0) {
        // The first line is wider (hanging indent): set the extra
        // characters aside, wrap the rest, then put them back
        $extra = -$delta;
        $head = substr($string, 0, $extra);
        $tail = substr($string, $extra);

        return $head . wordwrap($tail, $width, $break, $cutLongWords);
    }

    // The first line is narrower (first line indent): pad it with
    // placeholder characters, wrap, then remove the padding
    $padded = str_repeat('x', $delta) . $string;
    $wrapped = wordwrap($padded, $width, $break, $cutLongWords);

    return substr($wrapped, $delta);
}
618:
/**
 * Undo wordwrap(), preserving Markdown-style paragraphs and lists
 *
 * Non-consecutive line breaks are converted to spaces except before:
 *
 * - four or more spaces
 * - one or more tabs
 * - Markdown-style list items (e.g. `- item`, `1. item`)
 *
 * Line breaks at the start and end of the string are also preserved.
 *
 * @param string $break The line break sequence to replace.
 * @param bool $ignoreEscapes If `false`, preserve escaped whitespace.
 * @param bool $trimLines If `true`, remove whitespace from the end of each
 * line and between unwrapped lines.
 * @param bool $collapseBlankLines If `true`, collapse three or more
 * subsequent line breaks to two.
 */
public static function unwrap(
    string $string,
    string $break = "\n",
    bool $ignoreEscapes = true,
    bool $trimLines = false,
    bool $collapseBlankLines = false
): string {
    $newline = Regex::quote($break, '/');
    // If escapes are significant, only match after an even number of
    // backslashes (including none)
    $noEscape = $ignoreEscapes ? '' : '(?<!\\\\)(?:\\\\\\\\)*\K';

    $search = [];
    $replace = [];

    if ($trimLines) {
        $search[] = "/{$noEscape}\h+({$newline})/";
        $replace[] = '$1';
        $between = '\h*';
    } else {
        $between = '';
    }

    // Match a line break that is not adjacent to another line break or to
    // the start or end of the string, and is not followed by four spaces, a
    // tab or a list item
    $search[] = "/{$noEscape}(?<!{$newline}|^){$newline}(?!{$newline}|\$|    |\\t|(?:[-+*]|[0-9]+[).])\h){$between}/D";
    $replace[] = ' ';

    if ($collapseBlankLines) {
        $search[] = "/(?:{$newline}){3,}/";
        $replace[] = $break . $break;
    }

    return Regex::replace($search, $replace, $string);
}
662:
/**
 * Replace each sequence of whitespace characters in a string with one space
 */
public static function collapse(string $string): string
{
    $whitespace = '/\s++/';

    return Regex::replace($whitespace, ' ', $string);
}
670:
/**
 * Add delimiters to the start and end of a string
 *
 * @param string|null $after If `null`, `$before` is used before and after
 * the string.
 */
public static function enclose(string $string, string $before, ?string $after = null): string
{
    if ($after === null) {
        $after = $before;
    }

    return "{$before}{$string}{$after}";
}
681:
/**
 * Get the Levenshtein distance between two strings as a fraction of the
 * length, in bytes, of the longest string
 *
 * @param bool $normalise If `true`, both strings are passed through
 * normalise() before they are compared.
 * @return float A value between `0` and `1`, where `0` means the strings
 * are identical, and `1` means they have no similarities.
 */
public static function distance(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    $longest = max(strlen($string1), strlen($string2));

    // Both strings are empty, so they are identical
    if ($longest === 0) {
        return 0.0;
    }

    $edits = levenshtein($string1, $string2);

    return $edits / $longest;
}
706:
/**
 * Get the similarity of two strings, as reported by similar_text(), as a
 * fraction of the length, in bytes, of the longest string
 *
 * similar_text() is called with the strings in both orders and the higher
 * result is used.
 *
 * @param bool $normalise If `true`, both strings are passed through
 * normalise() before they are compared.
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no similarities, and `1` means they are identical.
 */
public static function similarity(
    string $string1,
    string $string2,
    bool $normalise = false
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    $longest = max(strlen($string1), strlen($string2));

    // Both strings are empty, so they are identical
    if ($longest === 0) {
        return 1.0;
    }

    $forward = similar_text($string1, $string2);
    $reverse = similar_text($string2, $string1);

    return max($forward, $reverse) / $longest;
}
733:
/**
 * Get the number of ngrams two strings have in common as a fraction of the
 * number of ngrams in the longest string
 *
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramSimilarity(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    $relativeToLongest = true;

    return self::ngramScore($relativeToLongest, $string1, $string2, $normalise, $size);
}
749:
/**
 * Get the number of ngrams two strings have in common as a fraction of the
 * number of ngrams in the shortest string
 *
 * @return float A value between `0` and `1`, where `0` means the strings
 * have no shared ngrams, and `1` means their ngrams are identical.
 */
public static function ngramIntersection(
    string $string1,
    string $string2,
    bool $normalise = false,
    int $size = 2
): float {
    $relativeToLongest = false;

    return self::ngramScore($relativeToLongest, $string1, $string2, $normalise, $size);
}
765:
/**
 * Get the number of ngrams two strings have in common as a fraction of the
 * number of ngrams in the longest or shortest string
 *
 * Each ngram in `$string2` is matched with at most one ngram in
 * `$string1`.
 *
 * @param bool $relativeToLongest If `true`, divide by the ngram count of
 * the string with the most ngrams, otherwise by the ngram count of the
 * string with the fewest.
 * @param bool $normalise If `true`, both strings are passed through
 * normalise() before they are compared.
 * @return float `1.0` if both strings are shorter than `$size`, `0.0` if
 * the applicable ngram count is zero (e.g. only one string is shorter than
 * `$size` and the score is relative to the shortest string), otherwise a
 * value between `0` and `1`.
 */
private static function ngramScore(
    bool $relativeToLongest,
    string $string1,
    string $string2,
    bool $normalise,
    int $size
): float {
    if ($normalise) {
        $string1 = self::normalise($string1);
        $string2 = self::normalise($string2);
    }

    // Neither string has any ngrams, so their ngrams are identical
    if (strlen($string1) < $size && strlen($string2) < $size) {
        return 1.0;
    }

    $ngrams1 = self::ngrams($string1, $size);
    $ngrams2 = self::ngrams($string2, $size);
    $count = $relativeToLongest
        ? max(count($ngrams1), count($ngrams2))
        : min(count($ngrams1), count($ngrams2));

    // Nothing can be shared with a string that has no ngrams, and dividing
    // by zero would throw a DivisionByZeroError
    if ($count === 0) {
        return 0.0;
    }

    $same = 0;
    foreach ($ngrams1 as $ngram) {
        $key = array_search($ngram, $ngrams2, true);
        if ($key !== false) {
            $same++;
            // Don't match the same ngram in $string2 twice
            unset($ngrams2[$key]);
        }
    }

    return $same / $count;
}
799:
/**
 * Get every substring of a given length from a string
 *
 * Ngrams are grouped by starting offset modulo `$size`, so for "abcd" and
 * a size of 2, the result is `['ab', 'cd', 'bc']`. An empty array is
 * returned if the string is shorter than `$size`.
 *
 * @return string[]
 */
public static function ngrams(string $string, int $size = 2): array
{
    $length = strlen($string);
    if ($length < $size) {
        return [];
    }

    $ngrams = [];
    for ($offset = 0; $offset < $size; $offset++) {
        // Use the longest run from $offset that is a multiple of $size
        $usable = $length - $offset;
        $usable -= $usable % $size;
        if ($usable === 0) {
            continue;
        }
        /** @var string[] */
        $chunks = str_split(substr($string, $offset, $usable), $size);
        foreach ($chunks as $chunk) {
            $ngrams[] = $chunk;
        }
    }

    return $ngrams;
}
830:
/**
 * Group lists in a string by heading and remove duplicate items
 *
 * - Lines in `$string` are processed in order, from first to last
 * - If a non-empty line matches `$itemRegex`, it is treated as a list item,
 *   otherwise it becomes the current heading
 * - The current heading is cleared when an empty line is encountered after
 *   a list item (unless `$loose` is `true`)
 * - Top-level lines (headings with no items, and items with no heading) are
 *   returned before lists with headings
 * - If `$itemRegex` has a named subpattern called `indent` that matches a
 *   non-empty string, subsequent lines with indentation of the same width
 *   are treated as a continuation of the item, along with any empty lines
 *   between them
 *
 * The work is delegated to {@see ListMerger}.
 *
 * @param string $listSeparator Inserted between headings and lists.
 * @param string|null $headingPrefix Inserted before headings, e.g. `"-"`.
 * Indentation of the same width is applied to subsequent list items. An
 * empty string is equivalent to `null`.
 * @param string|null $itemRegex If `null`, {@see Str::DEFAULT_ITEM_REGEX}
 * is used.
 * @param bool $clean If `true`, remove the first match of `$itemRegex` from
 * the beginning of each item with no heading.
 * @param bool $loose If `true`, do not clear the current heading when an
 * empty line is encountered.
 * @param bool $discardEmpty If `true`, discard headings with no items.
 * @param int<1,max> $tabSize
 */
public static function mergeLists(
    string $string,
    string $listSeparator = "\n",
    ?string $headingPrefix = null,
    ?string $itemRegex = Str::DEFAULT_ITEM_REGEX,
    bool $clean = false,
    bool $loose = false,
    bool $discardEmpty = false,
    string $eol = "\n",
    int $tabSize = 4
): string {
    // Normalise an empty prefix to null
    $prefix = self::coalesce($headingPrefix, null);
    $regex = $itemRegex ?? self::DEFAULT_ITEM_REGEX;

    $merger = new ListMerger(
        $listSeparator,
        $prefix,
        $regex,
        $clean,
        $loose,
        $discardEmpty,
        $eol,
        $tabSize,
    );

    return $merger->merge($string);
}
878:	}
879:

Namespaces

Classes