1: <?php declare(strict_types=1);
2:
3: namespace Salient\Sync\Support;
4:
5: use Salient\Contract\Core\TextComparisonAlgorithm;
6: use Salient\Contract\Core\TextComparisonAlgorithm as Algorithm;
7: use Salient\Contract\Core\TextComparisonFlag;
8: use Salient\Contract\Core\TextComparisonFlag as Flag;
9: use Salient\Contract\Sync\SyncEntityInterface;
10: use Salient\Contract\Sync\SyncEntityProviderInterface;
11: use Salient\Contract\Sync\SyncEntityResolverInterface;
12: use Salient\Utility\Str;
13: use Closure;
14: use LogicException;
15:
/**
 * Resolves a name to an entity using one or more text comparison algorithms
 *
 * Entities are retrieved from the provider the first time a name is resolved
 * and are reused for subsequent lookups. Results are cached by (optionally
 * normalised) name.
 *
 * @template TEntity of SyncEntityInterface
 *
 * @implements SyncEntityResolverInterface<TEntity>
 */
final class SyncEntityFuzzyResolver implements SyncEntityResolverInterface
{
    /**
     * Supported algorithms, in the order they are applied
     */
    private const ALGORITHMS = [
        Algorithm::SAME,
        Algorithm::CONTAINS,
        Algorithm::LEVENSHTEIN,
        Algorithm::SIMILAR_TEXT,
        Algorithm::NGRAM_SIMILARITY,
        Algorithm::NGRAM_INTERSECTION,
    ];

    /** @var SyncEntityProviderInterface<TEntity> */
    private $EntityProvider;
    /** @var string|Closure(TEntity): (string|null) */
    private $NameProperty;
    /** @var int-mask-of<Algorithm::*|Flag::*> */
    private $Algorithm;
    /** @var array<Algorithm::*,float>|float|null */
    private $UncertaintyThreshold;
    /** @var string|null */
    private $WeightProperty;
    /** @var bool */
    private $RequireOneMatch;

    /**
     * [ [ Entity, normalised name, weight ] ]
     *
     * @var array<array{TEntity,string,mixed|null}>|null
     */
    private $Entities;

    /**
     * Query => [ entity, uncertainty ]
     *
     * @var array<string,array{TEntity|null,float|null}>
     */
    private $Cache = [];

    /**
     * Creates a new SyncEntityFuzzyResolver object
     *
     * @param SyncEntityProviderInterface<TEntity> $entityProvider
     * @param string|null $nameProperty If `null`, entity names are retrieved
     * via the closure returned by the entity's introspector.
     * @param int-mask-of<TextComparisonAlgorithm::*|TextComparisonFlag::*> $algorithm
     * @param array<TextComparisonAlgorithm::*,float>|float|null $uncertaintyThreshold
     * Either one threshold for every algorithm, or an array that maps
     * algorithms to thresholds. Array entries for algorithms that are not
     * enabled in `$algorithm` are discarded.
     * @param string|null $weightProperty If multiple entities are equally
     * similar to a given name, the one with the highest weight is preferred.
     * @param bool $requireOneMatch If `true`, a name only resolves to an entity
     * when an algorithm narrows the candidates down to exactly one.
     * @throws LogicException if `$requireOneMatch` is `true` but no threshold
     * applies and neither `SAME` nor `CONTAINS` is enabled.
     */
    public function __construct(
        SyncEntityProviderInterface $entityProvider,
        ?string $nameProperty = null,
        int $algorithm =
            TextComparisonAlgorithm::SAME
            | TextComparisonAlgorithm::CONTAINS
            | TextComparisonAlgorithm::NGRAM_SIMILARITY
            | TextComparisonFlag::NORMALISE,
        $uncertaintyThreshold = null,
        ?string $weightProperty = null,
        bool $requireOneMatch = false
    ) {
        // Reduce $uncertaintyThreshold to values that will actually be applied,
        // and normalise them to float so the strict comparisons with 0.0 in
        // getByName() behave the same for `0` as they do for `0.0`
        if (is_array($uncertaintyThreshold)) {
            $uncertaintyThreshold = array_intersect_key(
                $uncertaintyThreshold,
                array_flip(self::ALGORITHMS),
            );
            foreach ($uncertaintyThreshold as $key => $value) {
                if (!($algorithm & $key) || $value === null) {
                    unset($uncertaintyThreshold[$key]);
                    continue;
                }
                $uncertaintyThreshold[$key] = (float) $value;
            }
            if (!$uncertaintyThreshold) {
                $uncertaintyThreshold = null;
            }
        } elseif ($uncertaintyThreshold !== null) {
            $uncertaintyThreshold = (float) $uncertaintyThreshold;
        }

        // Throw an exception if one match is required but the list of potential
        // matches is never narrowed
        if (
            $requireOneMatch
            && $uncertaintyThreshold === null
            && !($algorithm & (Algorithm::SAME | Algorithm::CONTAINS))
        ) {
            throw new LogicException(
                '$requireOneMatch cannot be true when $uncertaintyThreshold is null'
            );
        }

        $this->EntityProvider = $entityProvider;
        $this->NameProperty =
            $nameProperty === null
                ? SyncIntrospector::get($entityProvider->entity())->getGetNameClosure()
                : $nameProperty;
        $this->Algorithm = $algorithm;
        $this->UncertaintyThreshold = $uncertaintyThreshold;
        $this->WeightProperty = $weightProperty;
        $this->RequireOneMatch = $requireOneMatch;
    }

    /**
     * @inheritDoc
     */
    public function getByName(string $name, ?float &$uncertainty = null): ?SyncEntityInterface
    {
        if ($this->Entities === null) {
            $this->loadEntities();
        }

        if (!$this->Entities) {
            $uncertainty = null;
            return null;
        }

        if ($this->Algorithm & Flag::NORMALISE) {
            $name = Str::normalise($name);
        }

        if (isset($this->Cache[$name])) {
            [$entity, $uncertainty] = $this->Cache[$name];
            return $entity;
        }

        // Each time an algorithm narrows the list, the uncertainty it computed
        // is appended to every surviving entry, so entries grow by one element
        // per applied algorithm
        /** @var array<array{TEntity,string,mixed|null,...}> */
        $entries = $this->Entities;
        $applied = 0;

        foreach (self::ALGORITHMS as $algorithm) {
            if (!($this->Algorithm & $algorithm)) {
                continue;
            }

            $threshold = $this->getThreshold($algorithm);

            // Skip this algorithm if it would achieve nothing
            if ($this->RequireOneMatch && $threshold === null) {
                continue;
            }

            $next = [];
            foreach ($entries as $entry) {
                $entityUncertainty = $this->getUncertainty(
                    $name,
                    $entry[1],
                    $algorithm,
                );
                // A threshold of exactly 0.0 only admits entities with no
                // uncertainty; any other threshold is exclusive
                if ($threshold !== null && (
                    ($threshold !== 0.0 && $entityUncertainty >= $threshold)
                    || ($threshold === 0.0 && $entityUncertainty > $threshold)
                )) {
                    continue;
                }
                $entry[] = $entityUncertainty;
                $next[] = $entry;
            }

            // If there are no matching entities, try again with the next
            // algorithm
            if (!$next) {
                continue;
            }

            // If there is one matching entity, return it
            if (count($next) === 1) {
                return $this->cacheResult($name, $next[0], $uncertainty);
            }

            // Otherwise, narrow the list of potential matches and continue
            $entries = $next;
            $applied++;
        }

        if (!$applied || $this->RequireOneMatch) {
            return $this->cacheResult($name, null, $uncertainty);
        }

        usort(
            $entries,
            static function (array $e1, array $e2) use ($applied): int {
                // Uncertainty values, most recent to least recent, ascending
                for ($i = $applied + 2; $i >= 3; $i--) {
                    $result = $e1[$i] <=> $e2[$i];
                    if ($result) {
                        return $result;
                    }
                }
                // Weight, descending
                return $e2[2] <=> $e1[2];
            }
        );

        return $this->cacheResult($name, $entries[0], $uncertainty);
    }

    /**
     * Get the uncertainty threshold to apply for a given algorithm, or null if
     * entities should not be filtered by uncertainty
     *
     * When exactly one match is required, SAME and CONTAINS always use a
     * threshold of 1.0, i.e. only entities they match are retained.
     *
     * @param Algorithm::* $algorithm
     */
    private function getThreshold(int $algorithm): ?float
    {
        if (
            $this->RequireOneMatch
            && ($algorithm & (Algorithm::SAME | Algorithm::CONTAINS))
        ) {
            return 1.0;
        }

        if (is_array($this->UncertaintyThreshold)) {
            return $this->UncertaintyThreshold[$algorithm] ?? null;
        }

        return $this->UncertaintyThreshold;
    }

    /**
     * Retrieve entities from the provider and index their names and weights
     *
     * Entities without a name are ignored because they cannot be compared with
     * a name.
     */
    private function loadEntities(): void
    {
        $this->Entities = [];
        foreach ($this->EntityProvider->getList() as $entity) {
            $name =
                is_string($this->NameProperty)
                    ? $entity->{$this->NameProperty}
                    : ($this->NameProperty)($entity);

            // The name closure is allowed to return null, but the comparison
            // code only accepts strings
            if ($name === null) {
                continue;
            }

            $this->Entities[] = [
                $entity,
                $this->Algorithm & Flag::NORMALISE
                    ? Str::normalise($name)
                    : $name,
                $this->WeightProperty === null
                    ? 0
                    : $entity->{$this->WeightProperty},
            ];
        }
    }

    /**
     * Get the uncertainty of a match between two strings for one algorithm
     *
     * SAME and CONTAINS return 0.0 for a match and 1.0 otherwise. The other
     * algorithms return the value derived from the relevant Str method.
     *
     * @param Algorithm::SAME|Algorithm::CONTAINS|Algorithm::LEVENSHTEIN|Algorithm::SIMILAR_TEXT|Algorithm::NGRAM_SIMILARITY|Algorithm::NGRAM_INTERSECTION $algorithm
     * @throws LogicException if `$algorithm` is not a supported algorithm.
     */
    private function getUncertainty(string $string1, string $string2, int $algorithm): float
    {
        switch ($algorithm) {
            case Algorithm::SAME:
                return $string1 === $string2
                    ? 0.0
                    : 1.0;

            case Algorithm::CONTAINS:
                // Either string may contain the other
                return
                    strpos($string2, $string1) !== false
                    || strpos($string1, $string2) !== false
                        ? 0.0
                        : 1.0;

            case Algorithm::LEVENSHTEIN:
                return Str::distance($string1, $string2);

            case Algorithm::SIMILAR_TEXT:
                return 1 - Str::similarity($string1, $string2);

            case Algorithm::NGRAM_SIMILARITY:
                return 1 - Str::ngramSimilarity($string1, $string2);

            case Algorithm::NGRAM_INTERSECTION:
                return 1 - Str::ngramIntersection($string1, $string2);

            default:
                // Report the algorithm that was rejected, not the configured
                // bitmask
                throw new LogicException(sprintf(
                    'Invalid algorithm: %d',
                    $algorithm,
                ));
        }
    }

    /**
     * Cache the outcome of a lookup and return the resolved entity, if any
     *
     * The last element of `$entry` is taken as the uncertainty of the match
     * and is assigned to `$uncertainty`. If `$entry` is `null`, a negative
     * result is cached and `$uncertainty` is set to `null`.
     *
     * @param array{TEntity,string,mixed|null,...}|null $entry
     * @return TEntity|null
     */
    private function cacheResult(string $name, ?array $entry, ?float &$uncertainty): ?SyncEntityInterface
    {
        if ($entry === null) {
            $uncertainty = null;
            $this->Cache[$name] = [null, null];
            return null;
        }

        $uncertainty = array_pop($entry);
        $this->Cache[$name] = [$entry[0], $uncertainty];
        return $entry[0];
    }
}
300: