<?php
/*---------------------------------------------------------------*/
/*
    Title        : Computes, as a percentage, the similarity between 2 MySQL fields
    URL          : https://phpsources.net/code_s.php?id=333
    Created      : 31 Jan 2008
    Last updated : 23 Sept 2019
    Update notes :
      - code refactored for PHP 7
      - behaviour of the code verified
      - code corrected
      - description modified
*/
/*---------------------------------------------------------------*/

// MySQL connection settings.
$db_server     = 'localhost'; // MySQL server address
$db_name       = '';          // database name
$db_user_login = 'root';      // user name
$db_user_pass  = '';          // user password

// Open the connection to the MySQL server.
// Depending on the mysqli report mode a failed connection either throws a
// mysqli_sql_exception or returns false. Both cases stop the script here with
// a clear message instead of failing later on the first $conn->query() call.
try {
    $conn = mysqli_connect($db_server, $db_user_login, $db_user_pass, $db_name);
} catch (mysqli_sql_exception $e) {
    $conn = false;
}

if ($conn === false) {
    exit('Connexion MySQL impossible : ' . htmlspecialchars((string) mysqli_connect_error()));
}

/*******************************************************************************
 * Initialisation
 ******************************************************************************/

// Similarity threshold, expressed as a percentage of common words. It is read
// by the comparison loop further down to decide which pairs are reported.
// Author's guidance: a perfect duplicate is 100% of common words, so use 100
// to target perfect duplicates; use 90 to tolerate 90% of common words.
$valeur_pourcentage = 70;

// Name of the SQL table to scan.
$table_sql = '';

// Name of the identifier column of that table.
$champ_id = '';

// Name of the column on which the similarity search is performed.
$champ_recherche_sql = '';

// Bound used by the outer comparison loop. Start small: the script takes a
// long time to run. Author's guidance: to loop over the whole table, replace
// $nbre_elements with sizeof($tab_matchv) in that loop.
$nbre_elements = 100;

/*******************************************************************************
 * Accent removal
 ******************************************************************************/

/**
 * Replaces accented Latin letters by their unaccented ASCII equivalent.
 *
 * The replacement uses the array form of strtr(), which substitutes whole
 * substrings. The three-argument form works byte by byte and therefore
 * corrupts multi-byte (UTF-8) accented characters.
 *
 * @param string $texte Text to clean; expected to be UTF-8 encoded.
 *
 * @return string The text with the accented letters listed below replaced;
 *                every other character is left untouched.
 */
function TexteSansAccent($texte)
{
    static $remplacements = [
        'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A',
        'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a',
        'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ø' => 'O',
        'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
        'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
        'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
        'Ç' => 'C', 'ç' => 'c',
        'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
        'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
        'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
        'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
        'ÿ' => 'y', 'Ñ' => 'N', 'ñ' => 'n',
    ];

    // The cast keeps NULL values (e.g. NULL database columns) from reaching strtr().
    return strtr((string) $texte, $remplacements);
}

/*******************************************************************************
 * magic_quotes
 ******************************************************************************/

/**
 * Removes the backslashes added by magic_quotes_gpc when that legacy feature
 * is active, and returns the string unchanged otherwise.
 *
 * The magic quotes feature was removed in PHP 5.4 and get_magic_quotes_gpc()
 * itself was removed in PHP 8.0, so the function is only called on
 * interpreters old enough to still support the feature.
 *
 * @param string $chaine String to clean.
 *
 * @return string The string without magic-quotes escaping.
 */
function AuStrip_Slashes($chaine)
{
    if (PHP_VERSION_ID < 50400 && get_magic_quotes_gpc() == 1) {
        return stripslashes($chaine);
    }

    return $chaine;
}

/*******************************************************************************
 * Stop words
 ******************************************************************************/

// Words removed from every text before the comparison (the word lists are
// filtered with array_diff() against this array).
// Fixes compared with the previous list:
//   - "je juste" and "soyez sujet" were two words merged into one entry by a
//     missing separator, so neither word was ever filtered;
//   - "là", "où" and "ça" were stored with a broken character encoding;
//   - unaccented variants of the accented entries are listed at the end,
//     because the texts have their accents stripped before the comparison.
$stop_words = [
    "alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir",
    "bon", "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme",
    "comment", "dans", "des", "du", "dedans", "dehors", "depuis",
    "deux",
    "devrait", "doit", "donc", "dos", "droite", "début", "elle",
    "elles",
    "en", "encore", "essai", "est", "et", "eu", "fait", "faites",
    "fois",
    "font", "force", "haut", "hors", "ici", "il", "ils", "je", "juste",
    "la",
    "le", "les", "leur", "là", "ma",
    "maintenant", "mais", "mes", "mine", "moins", "mon", "mot",
    "même",
    "ni",
    "nommés", "notre", "nous", "nouveaux",
    "ou", "où", "par", "parce", "parole", "pas", "personnes", "peut",
    "peu",
    "pièce", "plupart", "pour", "pourquoi",
    "quand", "que", "quel", "quelle", "quelles", "quels", "qui", "sa",
    "sans", "ses", "seulement", "si", "sien", "son",
    "sont", "sous", "soyez", "sujet", "sur", "ta", "tandis", "tellement",
    "tels", "tes", "ton", "tous", "tout", "trop",
    "très", "tu", "valeur", "voie", "voient", "vont", "votre",
    "vous",
    "vu", "ça", "étaient", "état", "étions", "été",
    "être",
    "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit",
    "neuf", "dix",
    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "avec", "chez", "par", "dans", "des", "en", "de", "une", "votre",
    "meilleurs", "entre", "entres",
    "depuis", "alors", "ne", "pas", "du", "meme",
    "ou", "nom", "seuls", "acceptes", "ayant",
    "mais", "ou", "et", "donc", "or", "ni", "car",
    "vos", "votre", "mes", "mien", "mien", "tien", "tiens", "tout",
    "toute", "toutes",
    "que", "quoi", "qui", "comment", "peu", "peut", "pis", "puis", "pas",
    "chaque", "chacun", "chacune",
    "son", "ses", "au", "aux", "se", "sur", "ce", "ceux", "cette", "ca",
    "ci", "ceci", "cela", "aussi", "pour",
    "petit", "grand", "moyen", "large", "haut", "bas", "milieu",
    "droite",
    "gauche", "centre",
    "dit", "etre", "leur", "leurs",
    "plus", "moin", "moins",
    "es", "est", "sont", "son", "va", "suis", "ai", "viens",
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "l", "m", "n", "o", "p", "q",
    "r", "s", "t", "u", "v", "w", "x", "y", "z",
    // Unaccented variants of the accented entries above.
    "debut", "nommes", "piece", "tres", "etaient", "etat", "etions", "ete",
];

/*******************************************************************************
 * Execution time limit
 ******************************************************************************/

// Lift the execution time limit: the script can take a long time to run.
set_time_limit(0);

/*******************************************************************************
 * Loading - puts every row of the table into the two comparison arrays
 ******************************************************************************/

/**
 * Turns a raw field value into a comma-separated list of significant words.
 *
 * Steps: strip magic-quotes slashes, strip accents, turn apostrophes into
 * spaces, lower-case, split on every run of non-alphabetic characters, then
 * drop the stop words.
 *
 * @param string $texte      Raw field value.
 * @param array  $stop_words Words to ignore.
 *
 * @return string The remaining words joined by commas ('' when none is left).
 */
function ListeMotsSignificatifs($texte, array $stop_words)
{
    $texte = AuStrip_Slashes((string) $texte);
    $texte = TexteSansAccent($texte);
    $texte = str_replace("'", ' ', $texte);
    $texte = strtolower($texte);

    // PREG_SPLIT_NO_EMPTY: without it a text starting or ending with a
    // non-letter yields empty tokens, which were then counted as words.
    $mots = preg_split('#[^[:alpha:]]+#', $texte, -1, PREG_SPLIT_NO_EMPTY);

    return implode(',', array_diff($mots, $stop_words));
}

// The table and column names come from the configuration variables of this
// script; they are interpolated as identifiers and must not be user input.
try {
    $resultat = $conn->query("SELECT $champ_id,$champ_recherche_sql FROM $table_sql");
} catch (mysqli_sql_exception $e) {
    $resultat = false;
}

if (!($resultat instanceof mysqli_result)) {
    exit('Requête SQL impossible : ' . htmlspecialchars((string) $conn->error));
}

// Flat layout expected by the rest of the script:
// [id, words, id, words, ...] - even indexes hold the identifier of a row,
// the following odd index holds its comma-separated word list.
$tab_matchv = [];
while ($ligne = mysqli_fetch_array($resultat)) {
    $tab_matchv[] = $ligne[$champ_id];
    $tab_matchv[] = ListeMotsSignificatifs($ligne[$champ_recherche_sql], $stop_words);
}
mysqli_free_result($resultat);

// Both sides of the comparison hold the same data, so the second array is a
// copy instead of a second identical query and normalisation pass.
$tab_match = $tab_matchv;

/*******************************************************************************
 * Screen output
 ******************************************************************************/

// $tab_match stores two entries per row (identifier, then word list), hence
// the division by two to get the number of rows.
$nombre_de_champs = (int) (count($tab_match) / 2);

// Each <strong> element is now closed on the line that opens it; previously
// the second one stayed open and wrapped the whole third line.
echo '<strong>Nombre de champs : ' . $nombre_de_champs . '</strong><br /><br />';
echo "<strong>Verifie les $nbre_elements premiers éléments</strong><br /><br />";
echo '<strong>Tolérance maximum de mots communs en pourcentage :</strong> '
    . $valeur_pourcentage . '%<br /><br />';

/*******************************************************************************
 * The algorithm
 ******************************************************************************/

// $tab_matchv (reference rows) and $tab_match (candidate rows) use the flat
// layout [id, words, id, words, ...]: index 2n is the identifier of row n and
// index 2n + 1 its comma-separated word list.
//
// Fixes compared with the previous loop:
//   - it advanced its index twice per pass, so only half of $nbre_elements
//     rows were checked, and it read past the end of the array when the table
//     was smaller than that;
//   - it discarded every 100% score to hide the comparison of a row with
//     itself, which also hid genuine perfect duplicates. A row is now skipped
//     only when both identifiers are the same, and the threshold is inclusive
//     so that a threshold of 100 reports perfect duplicates;
//   - rows without any significant word are skipped instead of being scored;
//   - values coming from the database are HTML-escaped before being printed.

$delete_id_tb1 = [];
$delete_id_tb2 = [];

// HTML-escapes a value coming from the database.
$echapper = function ($valeur) {
    return htmlspecialchars((string) $valeur, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};

$nb_lignes_reference = (int) (count($tab_matchv) / 2);
$nb_lignes_candidat  = (int) (count($tab_match) / 2);
$nb_a_verifier       = min((int) $nbre_elements, $nb_lignes_reference);

for ($ref = 0; $ref < $nb_a_verifier; $ref++) {
    $id_reference    = $tab_matchv[2 * $ref];
    $texte_reference = (string) $tab_matchv[2 * $ref + 1];

    // Split once per reference row rather than once per compared pair.
    $mots_reference = preg_split('#,#', $texte_reference, -1, PREG_SPLIT_NO_EMPTY);

    for ($cand = 0; $cand < $nb_lignes_candidat; $cand++) {
        $id_candidat = $tab_match[2 * $cand];

        // Never compare a row with itself.
        if ($id_candidat === $id_reference) {
            continue;
        }

        $texte_candidat = (string) $tab_match[2 * $cand + 1];
        $mots_candidat  = preg_split('#,#', $texte_candidat, -1, PREG_SPLIT_NO_EMPTY);
        $nb_mots        = count($mots_candidat);

        // Nothing to compare (also avoids a division by zero).
        if ($nb_mots === 0) {
            continue;
        }

        // Share of the candidate's words that are absent from the reference.
        $nb_absents = count(array_diff($mots_candidat, $mots_reference));
        $inaccuracy = (int) round(($nb_absents / $nb_mots) * 100);
        $accuracy   = 100 - $inaccuracy;

        if ($accuracy < $valeur_pourcentage) {
            continue;
        }

        $delete_id_tb1[] = $id_reference;
        $delete_id_tb2[] = $id_candidat;

        echo 'Il y a <strong><font color="#ff0000">' . $accuracy . '</font></strong>% '
            . 'de contenu similaire entre: <br /><strong>ID: ' . $echapper($id_reference)
            . '</strong> &gt;&gt;' . $echapper($texte_reference)
            . '<br /><strong>ID : ' . $echapper($id_candidat) . '</strong> &gt;&gt; '
            . $echapper($texte_candidat) . "<br /><br /><br />\n";
    }
}

/*******************************************************************************
 * Deletion helper - it is up to you to decide on which side the deletion is
 * done: the two lists of identifiers are printed below.
 ******************************************************************************/

// HTML-escapes every distinct identifier of a list and joins them with commas.
$formater_liste_id = function (array $identifiants) {
    $echappes = array_map(
        function ($identifiant) {
            return htmlspecialchars((string) $identifiant, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        },
        array_unique($identifiants)
    );

    return implode(',', $echappes);
};

echo '<br /><br />Liste des ID de la premiere table<br />';
echo $formater_liste_id($delete_id_tb1);

echo '<br /><br />Liste des ID de la seconde table<br />';
echo $formater_liste_id($delete_id_tb2);

// End of the script. The closing PHP tag is intentionally omitted so that
// nothing located after it can be sent to the browser as stray output.