使用PHP借助DFA算法实现敏感词过滤功能。代码参照了zenghansen的实现,修改了原类中的BUG,增加了查找方法,并针对“王*八&&蛋”这类在中间填充无意义字符来混淆的词,增加了增强匹配处理。
DFA敏感词过滤的原理是:将所有的敏感词载入内存,构建成一棵棵树结构(字典树);然后将待匹配的字符串按字符拆分成数组,用每个数组元素依次去匹配敏感词字典树的节点,匹配到终节点就说明匹配成功。
对应PHP实现代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
| <?php
/**
 * Sensitive-word detector/filter backed by a character trie (DFA).
 *
 * Every word is split into UTF-8 characters and stored as a path of nested
 * arrays; the node reached after the last character of a word carries the
 * marker key 'end'. Matching tolerates "filler" characters (see FILLER_CHARS)
 * inserted between the characters of a word, e.g. "a*b" still matches "ab".
 * Words are lower-cased with strtolower() when stored and input characters
 * are lower-cased with strtolower() before lookup.
 */
class SensitiveWordsUtils
{
    /**
     * Single-byte characters that are skipped when they appear between the
     * characters of a sensitive word.
     */
    const FILLER_CHARS = ' +=*&$#@"\')(~_';

    /** @var array nested trie: character => child node; 'end' => true marks a complete word */
    private $dict;

    /**
     * Builds the trie.
     *
     * @param array $words list of sensitive words (UTF-8 strings)
     */
    public function __construct($words)
    {
        $this->dict = array();
        foreach ($words as $_word) {
            $uWord = $this->unicodeSplit($_word);
            if (count($uWord) === 0) {
                // An empty word would only put a meaningless 'end' marker on the root.
                continue;
            }
            $pdict = &$this->dict;
            foreach ($uWord as $char) {
                if (!isset($pdict[$char])) {
                    $pdict[$char] = array();
                }
                $pdict = &$pdict[$char];
            }
            $pdict['end'] = true;
            // Drop the reference so the next word starts from a clean binding.
            unset($pdict);
        }
    }

    /**
     * Tells whether $str contains at least one sensitive word.
     *
     * @param string   $str         text to scan
     * @param int|null $maxDistance same meaning as in filter(): scanning of a
     *                              candidate stops once this many consecutive
     *                              filler characters have been skipped. null
     *                              (the default) means no limit, which is the
     *                              historical behaviour of this method. Pass
     *                              the value you give to filter() to make both
     *                              methods agree.
     * @return bool
     */
    public function contains($str, $maxDistance = null)
    {
        if ($maxDistance !== null && $maxDistance < 1) {
            $maxDistance = 1;
        }
        // unicodeSplit() lower-cases the whole string, so no per-character strtolower() is needed.
        $uStr = $this->unicodeSplit($str);
        $count = count($uStr);
        for ($i = 0; $i < $count; $i++) {
            if (!isset($this->dict[$uStr[$i]])) {
                continue;
            }
            $node = $this->dict[$uStr[$i]];
            // Reaching any node marked 'end' means a complete word was seen.
            if (isset($node['end'])) {
                return true;
            }
            $gap = 0;
            for ($j = $i + 1; $j < $count; $j++) {
                $char = $uStr[$j];
                if (isset($node[$char])) {
                    $node = $node[$char];
                    if (isset($node['end'])) {
                        return true;
                    }
                    $gap = 0;
                } elseif ($this->isFiller($char)) {
                    $gap++;
                    if ($maxDistance !== null && $gap >= $maxDistance) {
                        break;
                    }
                } else {
                    break;
                }
            }
        }
        return false;
    }

    /**
     * Replaces every character of each detected sensitive word with '*'.
     *
     * Filler characters lying between the characters of a word are left
     * untouched. When a shorter word is a prefix of a longer candidate that
     * does not complete, the shorter word is still masked.
     *
     * @param string $str         text to filter
     * @param int    $maxDistance scanning of a candidate stops once this many
     *                            consecutive filler characters have been
     *                            skipped; values below 1 are treated as 1
     * @return string the text with sensitive characters masked
     */
    public function filter($str, $maxDistance = 5)
    {
        if ($maxDistance < 1) {
            $maxDistance = 1;
        }
        // Keep the original case in the output; lower-case only for lookups.
        $uStr = $this->unicodeSplit($str, false);
        $count = count($uStr);
        for ($i = 0; $i < $count; $i++) {
            $first = strtolower($uStr[$i]);
            if (!isset($this->dict[$first])) {
                continue;
            }
            $node = $this->dict[$first];
            // Indexes matched after $i that have not been masked yet.
            $matched = array();
            for ($j = $i + 1, $gap = 0; $gap < $maxDistance && $j < $count; $j++, $gap++) {
                $char = strtolower($uStr[$j]);
                if (isset($node[$char])) {
                    if (isset($node['end'])) {
                        // A complete word ends here; mask it before trying to extend to a longer one.
                        $i = $this->maskMatch($uStr, $i, $matched);
                        $matched = array();
                    }
                    $matched[] = $j;
                    $node = $node[$char];
                    // Becomes 0 after the loop's $gap++: the gap counter restarts on every match.
                    $gap = -1;
                } elseif (!$this->isFiller($char)) {
                    break;
                }
            }
            if (isset($node['end'])) {
                $i = $this->maskMatch($uStr, $i, $matched);
            }
        }
        return implode($uStr);
    }

    /**
     * Splits a UTF-8 string into an array of characters.
     *
     * Bytes that do not form a well-formed 1-4 byte sequence (stray
     * continuation bytes, truncated sequences, invalid lead bytes) are
     * dropped from the result.
     *
     * @param string $str      input text
     * @param bool   $caseword when true the string is passed through strtolower() first
     * @return array list of characters, each a 1-4 byte string
     */
    public function unicodeSplit($str, $caseword = true)
    {
        // Tolerate null/numeric input, e.g. values coming straight from a database row.
        $str = (string) $str;
        if ($caseword) {
            $str = strtolower($str);
        }
        $ret = array();
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $c = ord($str[$i]);
            if (($c & 0x80) === 0) {
                $ret[] = $str[$i];
                continue;
            }
            if (($c & 0xf8) === 0xf0) {
                $size = 4;
            } elseif (($c & 0xf0) === 0xe0) {
                $size = 3;
            } elseif (($c & 0xe0) === 0xc0) {
                $size = 2;
            } else {
                // Stray continuation byte or invalid lead byte.
                continue;
            }
            if ($len - $i < $size) {
                // Truncated sequence at the end of the string.
                continue;
            }
            $valid = true;
            for ($k = 1; $k < $size; $k++) {
                if ((ord($str[$i + $k]) & 0xc0) !== 0x80) {
                    $valid = false;
                    break;
                }
            }
            if ($valid) {
                $ret[] = substr($str, $i, $size);
                $i += $size - 1;
            }
        }
        return $ret;
    }

    /**
     * Tells whether $char is one of the filler characters that may separate
     * the characters of a sensitive word.
     */
    private function isFiller($char)
    {
        return strlen($char) === 1 && strpos(self::FILLER_CHARS, $char) !== false;
    }

    /**
     * Masks the character at $cursor and at every index in $indexes with '*'.
     *
     * @param array $chars   character list being filtered (modified in place)
     * @param int   $cursor  index of the first character of the match
     * @param array $indexes further matched indexes, in ascending order
     * @return int the cursor advanced over the masked indexes that directly
     *             follow it, so the caller does not rescan them
     */
    private function maskMatch(array &$chars, $cursor, array $indexes)
    {
        $chars[$cursor] = '*';
        foreach ($indexes as $k) {
            if ($k - $cursor === 1) {
                $cursor = $k;
            }
            $chars[$k] = '*';
        }
        return $cursor;
    }
}
|
使用示例:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
| <?php
/**
 * Loads the sensitive-word list from the `dtb_sensitive_words` table.
 *
 * The connection settings below are sample values; replace them with your
 * own configuration (do not keep real credentials in source code).
 *
 * @return array list of the `keyword` column values; empty when the query
 *               could not be prepared or executed
 */
function loadWords()
{
    $mysql_conf = array(
        'host' => '127.0.0.1',
        'port' => 3306,
        'db' => 'test',
        'db_user' => 'root',
        'db_pwd' => '123456',
    );

    // The PDO_MYSQL DSN has dedicated `port` and `charset` elements; the port
    // does not belong inside `host`, and `charset` replaces a separate
    // "SET NAMES" round trip.
    $dsn = sprintf(
        'mysql:host=%s;port=%d;dbname=%s;charset=utf8',
        $mysql_conf['host'],
        $mysql_conf['port'],
        $mysql_conf['db']
    );
    $pdo = new PDO($dsn, $mysql_conf['db_user'], $mysql_conf['db_pwd']);

    $words = array();
    $stmt = $pdo->prepare("select keyword from dtb_sensitive_words");
    // prepare() may return false when PDO runs in silent error mode.
    if ($stmt !== false && $stmt->execute()) {
        $words = $stmt->fetchAll(PDO::FETCH_COLUMN, 0);
    }

    // Release the statement and the connection.
    $stmt = null;
    $pdo = null;

    return $words;
}
// Build the filter from the stored word list.
$words = loadWords();
$util = new SensitiveWordsUtils($words);

// Sample input: "大 三 元" is a sensitive word disguised with spaces.
$param = "哪有宝宝穿越,是不是的啊大 三 元哈哈哈----";
echo $param, "\n";

// Print the masked text when something was detected, otherwise a notice.
echo $util->contains($param) ? $util->filter($param, 10) : "没有敏感词";
|
如下,敏感词有:宝宝、宝宝穿越记、大三元,输出的结果为:
1 2 3
| 哪有宝宝穿越,是不是的啊大 三 元哈哈哈---- 哪有**穿越,是不是的啊* * *哈哈哈----
|