MySQL中文索引解决方案:二元分词法
SQL代码
// Builds a MySQL boolean-mode full-text query against the `info` table:
// it selects each title together with its relevance score for the search
// string, keeps only rows that match, and returns the best matches first.
// NOTE(review): $title_ft is interpolated straight into the SQL string
// literal (twice). Nothing visible here escapes it, so confirm that the
// caller escapes the value or switch to a prepared statement.
$query = "SELECT title, MATCH( title_ft ) AGAINST( '$title_ft' IN BOOLEAN MODE ) AS score
FROM info
WHERE MATCH( title_ft ) AGAINST( '$title_ft' IN BOOLEAN MODE )
ORDER BY score DESC ";
其中,PHP 变量 $title_ft 是搜索关键词经过两个函数处理后得到的字符串,用它去匹配数据表中建立了全文索引的 title_ft 字段。
[已解决] monkey 的二元分词函数在拆分 UTF-8 编码的中文时出现乱码的问题
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head>
<title>momoca test</title>
</head>
<body>
<?php
/**
 * Split a UTF-8 string into search tokens using overlapping bigrams
 * ("dual decomposition") for multi-byte text.
 *
 * Rules:
 *  - Punctuation listed in $delimiters, whitespace and control bytes only
 *    separate tokens; they never appear in the output.
 *  - Every maximal run of printable single-byte (ASCII) characters becomes
 *    one token, e.g. "PHP".
 *  - Every maximal run of multi-byte characters is emitted as overlapping
 *    two-character tokens ("ABCD" -> "AB", "BC", "CD"). A run consisting of
 *    a single character is emitted as that character alone.
 *
 * The function works on raw bytes, so malformed UTF-8 never makes the
 * regular expressions fail; stray bytes simply act as separators.
 *
 * @param string $str UTF-8 encoded text to tokenize.
 *
 * @return string[] Tokens in order of appearance; an empty array when the
 *                  input contains nothing but delimiters.
 */
function dualDecom($str)
{
    $str = (string) $str;

    // Token separators. UTF-8 is self-synchronizing, so a plain byte-wise
    // str_replace() can never match across a character boundary; no marker
    // byte has to be inserted after each character (that trick is only
    // needed for encodings such as GBK).
    $delimiters = array(
        // ASCII punctuation
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')',
        '?', '-', '\'', '<', '>', '$', '&', '%', '#', '@', '+', '=', '{',
        '}', '[', ']',
        // CJK / full-width punctuation
        '。', '“', '”', '‘', '’', '、', '—', '《', '》', '…', '【', '】',
        ',', '!', ';', ':', '(', ')', '?', '.', '[', ']',
        // Ideographic space (U+3000)
        "\xE3\x80\x80",
    );
    $str = str_replace($delimiters, ' ', $str);

    // Collect maximal runs of either printable ASCII or multi-byte
    // characters. Whitespace and control bytes match neither alternative,
    // so they split runs without needing to be listed as delimiters.
    $runPattern = '/[\x21-\x7E]+|(?:[\xC2-\xF4][\x80-\xBF]{1,3})+/';
    if (!preg_match_all($runPattern, $str, $runs)) {
        return array();
    }

    $tokens = array();
    foreach ($runs[0] as $run) {
        // A run starting with a single-byte character is an ASCII word.
        if (ord($run[0]) < 0x80) {
            $tokens[] = $run;
            continue;
        }

        // Break the multi-byte run into individual characters
        // (lead byte followed by its continuation bytes).
        preg_match_all('/[\xC2-\xF4][\x80-\xBF]{1,3}/', $run, $chars);
        $chars = $chars[0];
        $total = count($chars);

        // An isolated character has no neighbour to pair with.
        if ($total === 1) {
            $tokens[] = $chars[0];
            continue;
        }

        // Overlapping bigrams: each character paired with its successor.
        for ($i = 0; $i + 1 < $total; $i++) {
            $tokens[] = $chars[$i] . $chars[$i + 1];
        }
    }

    return $tokens;
}
// Demo: tokenize a sample sentence that mixes Chinese text, ASCII letters
// and punctuation, then dump the resulting token array into the page.
$sampleText = '比如有一个字符串是
“你好PHP!”就只能分出“你好”一个词 ';
print_r(dualDecom($sampleText));
?>
</body>
</html>