| 1: | <?php declare(strict_types=1); |
| 2: | |
| 3: | namespace Salient\Sync\Support; |
| 4: | |
| 5: | use Salient\Contract\Sync\SyncEntityInterface; |
| 6: | use Salient\Contract\Sync\SyncEntityProviderInterface; |
| 7: | use Salient\Contract\Sync\SyncEntityResolverInterface; |
| 8: | use Salient\Utility\Reflect; |
| 9: | use Salient\Utility\Str; |
| 10: | use Closure; |
| 11: | use InvalidArgumentException; |
| 12: | |
| 13: | |
| 14: | |
| 15: | |
| 16: | |
| 17: | |
| 18: | |
| 19: | |
| 20: | |
| 21: | |
| 22: | |
| 23: | |
| 24: | |
/**
 * Resolves a name to an entity by comparing it with the names of the entities
 * returned by a sync entity provider
 *
 * The algorithms enabled via `$flags` are applied one after another, each one
 * narrowing the list of candidates left by the previous one, until exactly one
 * candidate remains or every algorithm has been applied. Results are cached
 * per (optionally normalised) name for the lifetime of the resolver.
 *
 * The `ALGORITHM_*` and `NORMALISE` constants used below are not declared in
 * this class; they are inherited via {@see SyncEntityResolverInterface}.
 */
final class SyncEntityFuzzyResolver implements SyncEntityResolverInterface
{
    /**
     * Flags used when the constructor's $flags argument is omitted
     */
    public const DEFAULT_FLAGS =
        SyncEntityFuzzyResolver::ALGORITHM_SAME
        | SyncEntityFuzzyResolver::ALGORITHM_CONTAINS
        | SyncEntityFuzzyResolver::ALGORITHM_NGRAM_SIMILARITY
        | SyncEntityFuzzyResolver::NORMALISE;

    /**
     * Algorithms that need an explicit uncertainty threshold when
     * $requireOneMatch is true
     */
    private const FUZZY_ALGORITHMS =
        self::ALGORITHM_LEVENSHTEIN
        | self::ALGORITHM_SIMILAR_TEXT
        | self::ALGORITHM_NGRAM_SIMILARITY
        | self::ALGORITHM_NGRAM_INTERSECTION;

    /** Provider whose getList() supplies the candidate entities */
    private SyncEntityProviderInterface $EntityProvider;

    /** True if entity names are read from $NameProperty, false if $GetNameClosure is used */
    private bool $HasNameProperty;

    /** Only initialised when $HasNameProperty is true */
    private string $NameProperty;

    /** Only initialised when $HasNameProperty is false */
    private Closure $GetNameClosure;

    /** Bitmask given to the constructor */
    private int $Flags;

    /**
     * Algorithm => threshold, in the order the algorithms are applied
     *
     * @var array<int,float|null>
     */
    private array $UncertaintyThreshold;

    /** True if entity weights are read from $WeightProperty */
    private bool $HasWeightProperty;

    /** Only initialised when $HasWeightProperty is true */
    private string $WeightProperty;

    /** Only initialised when $HasWeightProperty is false; may be null */
    private ?Closure $GetWeightClosure;

    /** If true, a result is only returned when the candidates narrow to exactly one entity */
    private bool $RequireOneMatch;

    /**
     * Candidate entries, loaded from the provider on the first lookup
     *
     * Each entry is `[entity, name, weight]`.
     *
     * @var array<array{SyncEntityInterface,string,mixed}>
     */
    private array $Entities;

    /**
     * Name => [entity, uncertainty], or [null, null] if nothing matched
     *
     * @var array<array{SyncEntityInterface|null,float|null}>
     */
    private array $Cache = [];

    /**
     * @param Closure|string|null $nameProperty If a string, the name of the
     * property to read each entity's name from. If a closure, it is called
     * with each entity and should return its name. If `null`, a closure is
     * obtained from the entity's introspector.
     * @param int $flags A bitmask of `ALGORITHM_*` flags, optionally combined
     * with `NORMALISE`.
     * @param array<int,float|int|null>|float|int|null $uncertaintyThreshold
     * Either one threshold applied to every enabled algorithm, or an array
     * that maps algorithms to thresholds. Entities whose uncertainty is not
     * below the threshold are discarded (a threshold of zero keeps entities
     * with zero uncertainty). `null` disables filtering for the algorithm.
     * @param Closure|string|null $weightProperty If a string, the name of the
     * property to read each entity's weight from. If a closure, it is called
     * with each entity and its return value is used as the weight. Weights
     * break ties between candidates; higher weights are preferred.
     * @param bool $requireOneMatch If `true`, lookups that do not narrow down
     * to exactly one entity resolve to `null`.
     * @throws InvalidArgumentException if no algorithm flag is set, or if
     * `$requireOneMatch` is `true` and a fuzzy algorithm has no threshold.
     */
    public function __construct(
        SyncEntityProviderInterface $entityProvider,
        $nameProperty = null,
        int $flags = SyncEntityFuzzyResolver::DEFAULT_FLAGS,
        $uncertaintyThreshold = null,
        $weightProperty = null,
        bool $requireOneMatch = false
    ) {
        $algorithms = $this->getAlgorithms($flags);
        if (!$algorithms) {
            throw new InvalidArgumentException('At least one algorithm flag must be set');
        }

        if (!is_array($uncertaintyThreshold)) {
            $uncertaintyThreshold = array_fill_keys($algorithms, $uncertaintyThreshold);
        }

        $this->UncertaintyThreshold = [];
        foreach ($algorithms as $algorithm) {
            $threshold = $uncertaintyThreshold[$algorithm] ?? null;

            // getByName() compares thresholds strictly with 0.0, so an integer
            // 0 would otherwise be treated as non-zero and reject every entity
            if (is_int($threshold)) {
                $threshold = (float) $threshold;
            }

            if ($requireOneMatch && $threshold === null) {
                if ($algorithm & self::FUZZY_ALGORITHMS) {
                    throw new InvalidArgumentException(sprintf(
                        'Invalid $uncertaintyThreshold for %s when $requireOneMatch is true',
                        Reflect::getConstantName(self::class, $algorithm),
                    ));
                }
                // Non-fuzzy algorithms only yield 0.0 or 1.0, so this keeps
                // matches and discards everything else
                $threshold = 1.0;
            }

            $this->UncertaintyThreshold[$algorithm] = $threshold;
        }

        $this->EntityProvider = $entityProvider;
        if (is_string($nameProperty)) {
            $this->HasNameProperty = true;
            $this->NameProperty = $nameProperty;
        } else {
            $this->HasNameProperty = false;
            $this->GetNameClosure = $nameProperty
                ?? SyncIntrospector::get($entityProvider->entity())
                    ->getGetNameClosure();
        }
        $this->Flags = $flags;
        if (is_string($weightProperty)) {
            $this->HasWeightProperty = true;
            $this->WeightProperty = $weightProperty;
        } else {
            $this->HasWeightProperty = false;
            $this->GetWeightClosure = $weightProperty;
        }
        $this->RequireOneMatch = $requireOneMatch;
    }

    /**
     * Get the entity that best matches a name
     *
     * @param float|null $uncertainty Receives the uncertainty reported for the
     * returned entity by the last algorithm applied to it, or `null` if no
     * entity is returned.
     * @return SyncEntityInterface|null `null` if the provider returned no
     * usable entities, if no algorithm narrowed the candidates, or if
     * `$requireOneMatch` is `true` and more than one candidate remains.
     */
    public function getByName(string $name, ?float &$uncertainty = null): ?SyncEntityInterface
    {
        $this->Entities ??= $this->getEntities();

        if (!$this->Entities) {
            $uncertainty = null;
            return null;
        }

        if ($this->Flags & self::NORMALISE) {
            $name = Str::normalise($name);
        }

        if (isset($this->Cache[$name])) {
            [$entity, $uncertainty] = $this->Cache[$name];
            return $entity;
        }

        $entries = $this->Entities;
        // Number of uncertainty values appended to each remaining entry
        $applied = 0;
        foreach ($this->UncertaintyThreshold as $algorithm => $threshold) {
            $next = [];
            foreach ($entries as $entry) {
                $entityName = $entry[1];
                $entityUncertainty = $this->getUncertainty($name, $entityName, $algorithm);
                // A non-zero threshold is exclusive; a zero threshold only
                // accepts an uncertainty of zero
                if ($threshold !== null && (
                    ($threshold !== 0.0 && $entityUncertainty >= $threshold)
                    || ($threshold === 0.0 && $entityUncertainty > $threshold)
                )) {
                    continue;
                }
                $entry[] = $entityUncertainty;
                $next[] = $entry;
            }

            // If nothing matched, keep the current candidates and try the next
            // algorithm
            if (!$next) {
                continue;
            }

            // If exactly one entity matched, it is the result
            if (count($next) === 1) {
                return $this->cacheResult($name, $next[0], $uncertainty);
            }

            // Otherwise, narrow the candidates and continue
            $entries = $next;
            $applied++;
        }

        if (!$applied || $this->RequireOneMatch) {
            return $this->cacheResult($name, null, $uncertainty);
        }

        // Sort the remaining candidates by uncertainty (ascending), starting
        // with the most recently applied algorithm, then by weight
        // (descending). Uncertainty values start at index 3 of each entry.
        usort(
            $entries,
            static function ($e1, $e2) use ($applied) {
                for ($i = $applied + 2; $i > 2; $i--) {
                    if ($result = $e1[$i] <=> $e2[$i]) {
                        return $result;
                    }
                }
                return $e2[2] <=> $e1[2];
            }
        );

        return $this->cacheResult($name, $entries[0], $uncertainty);
    }

    /**
     * Get the algorithms enabled in a bitmask, in the order they are applied
     *
     * @return int[]
     */
    private function getAlgorithms(int $flags): array
    {
        $algorithms = [];
        foreach ([
            self::ALGORITHM_SAME,
            self::ALGORITHM_CONTAINS,
            self::ALGORITHM_LEVENSHTEIN,
            self::ALGORITHM_SIMILAR_TEXT,
            self::ALGORITHM_NGRAM_SIMILARITY,
            self::ALGORITHM_NGRAM_INTERSECTION,
        ] as $algorithm) {
            if ($flags & $algorithm) {
                $algorithms[] = $algorithm;
            }
        }

        return $algorithms;
    }

    /**
     * Load entities from the provider as [entity, name, weight] entries
     *
     * Entities without a string name are skipped. Names are normalised if the
     * `NORMALISE` flag is set. Entities with no weight, or with a weight
     * property that is not an integer or float, are given the lowest possible
     * weight.
     *
     * @return array<array{SyncEntityInterface,string,mixed}>
     */
    private function getEntities(): array
    {
        $entities = [];
        foreach ($this->EntityProvider->getList() as $entity) {
            $name = $this->HasNameProperty
                ? ($entity->{$this->NameProperty} ?? null)
                : ($this->GetNameClosure)($entity);

            // Applies to both sources: a non-string name would trigger a
            // TypeError in getUncertainty() on the first lookup
            if (!is_string($name)) {
                continue;
            }

            if ($this->Flags & self::NORMALISE) {
                $name = Str::normalise($name);
            }

            if ($this->HasWeightProperty) {
                $weight = $entity->{$this->WeightProperty} ?? null;
                if (!(is_int($weight) || is_float($weight))) {
                    $weight = \PHP_INT_MIN;
                }
            } elseif ($this->GetWeightClosure) {
                $weight = ($this->GetWeightClosure)($entity);
            } else {
                $weight = \PHP_INT_MIN;
            }

            $entities[] = [$entity, $name, $weight];
        }

        return $entities;
    }

    /**
     * Get the uncertainty of a match between two strings, as reported by the
     * given algorithm
     *
     * Lower values indicate a closer match. `ALGORITHM_SAME` and
     * `ALGORITHM_CONTAINS` return `0.0` for a match and `1.0` otherwise. The
     * remaining algorithms delegate to `Str` helpers, which presumably return
     * values between 0 and 1 (TODO: confirm against `Str`).
     *
     * @throws \LogicException if `$algorithm` is not a supported algorithm.
     */
    private function getUncertainty(string $string1, string $string2, int $algorithm): float
    {
        switch ($algorithm) {
            case self::ALGORITHM_SAME:
                return $string1 === $string2
                    ? 0.0
                    : 1.0;

            case self::ALGORITHM_CONTAINS:
                // Either string may contain the other
                return strpos($string2, $string1) !== false
                    || strpos($string1, $string2) !== false
                        ? 0.0
                        : 1.0;

            case self::ALGORITHM_LEVENSHTEIN:
                return Str::distance($string1, $string2);

            case self::ALGORITHM_SIMILAR_TEXT:
                return 1.0 - Str::similarity($string1, $string2);

            case self::ALGORITHM_NGRAM_SIMILARITY:
                return 1.0 - Str::ngramSimilarity($string1, $string2);

            case self::ALGORITHM_NGRAM_INTERSECTION:
                return 1.0 - Str::ngramIntersection($string1, $string2);

            default:
                // Without this, falling off the end of the switch would raise
                // an opaque TypeError for the missing float return value
                throw new \LogicException(sprintf(
                    'Invalid algorithm: %d',
                    $algorithm,
                ));
        }
    }

    /**
     * Cache and return the result of a lookup
     *
     * @param array<mixed>|null $entry An `[entity, name, weight, ...]` entry
     * where the last element is the uncertainty to report, or `null` if no
     * entity matched.
     * @param float|null $uncertainty Receives the reported uncertainty, or
     * `null` if `$entry` is `null`.
     */
    private function cacheResult(string $name, ?array $entry, ?float &$uncertainty): ?SyncEntityInterface
    {
        if ($entry === null) {
            $uncertainty = null;
            $this->Cache[$name] = [null, null];
            return null;
        }

        // The most recently applied algorithm's uncertainty is always last
        $uncertainty = array_pop($entry);
        $this->Cache[$name] = [$entry[0], $uncertainty];
        return $entry[0];
    }
}
| 317: | |