实例代码如下:
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.Text;
public class TravelLabel {
// Detection rule used by isUTF8: the whole input must consist of one or more units,
// each being either a single character in the range x00-x7f or a three-character
// sequence of one character in xe0-xef followed by two characters in x80-xbf
// (the layout of a three-byte UTF-8 sequence). The empty string does not match.
protected static String encodeReg = "^(?:[\\x00-\\x7f]|[\\xe0-\\xef][\\x80-\\xbf]{2})+$";
// 以下三个方法用于判断url的编码格式是utf-8或gbk
/**
 * Reports whether {@code string}, after its percent-escapes have been expanded by
 * {@code unescape}, matches the {@code encodeReg} pattern from start to end.
 *
 * @param string the (possibly percent-escaped) text to test
 * @return {@code true} when the unescaped text matches {@code encodeReg} in full,
 *         {@code false} otherwise
 */
public static Boolean isUTF8(String string) {
    // Compile first, then unescape, so evaluation order matches the pattern-then-input flow.
    final Pattern rule = Pattern.compile(encodeReg);
    final String expanded = unescape(string);
    return rule.matcher(expanded).matches();
}
/**
 * Reports whether {@code string} survives an encode/decode round trip through the
 * GBK charset, i.e. whether every character in it can be represented in GBK.
 *
 * @param string the text to test
 * @return {@code true} if encoding to GBK and decoding back yields the same text
 * @throws UnsupportedEncodingException if the runtime does not provide the GBK charset
 */
public static boolean isGBK(String string) throws UnsupportedEncodingException {
    // Decode with the same charset that produced the bytes. The single-argument
    // String(byte[]) constructor uses the platform default charset, which made the
    // result depend on the machine the code happened to run on.
    final String roundTripped = new String(string.getBytes("GBK"), "GBK");
    return string.equals(roundTripped);
}
/**
 * Expands percent-escapes in {@code src}.
 *
 * <p>Two forms are recognised: {@code %XX} (two hex digits) and {@code %uXXXX}
 * (lowercase {@code u} followed by four hex digits). Each is replaced by the single
 * character whose code is the hex value. A {@code %} that is not followed by a
 * complete, valid escape (truncated input, non-hex digits) is copied through
 * unchanged instead of raising an exception.
 *
 * @param src the text to unescape; must not be {@code null}
 * @return {@code src} with every well-formed escape replaced by its character
 */
public static String unescape(String src) {
    final int length = src.length();
    // The buffer never leaves this method, so the unsynchronised builder is sufficient.
    final StringBuilder out = new StringBuilder(length);
    int i = 0;
    while (i < length) {
        final char c = src.charAt(i);
        if (c == '%') {
            final boolean wide = i + 1 < length && src.charAt(i + 1) == 'u';
            final int digitsAt = wide ? i + 2 : i + 1;
            final int digitCount = wide ? 4 : 2;
            final int code = parseHexDigits(src, digitsAt, digitCount);
            if (code >= 0) {
                out.append((char) code);
                i = digitsAt + digitCount;
                continue;
            }
            // Malformed escape: fall through and keep the '%' as literal text.
        }
        out.append(c);
        i++;
    }
    return out.toString();
}

/**
 * Reads exactly {@code count} hex digits of {@code s} starting at {@code start}.
 *
 * @return the parsed value, or -1 if the digits run past the end of {@code s} or
 *         any of them is not a hex digit
 */
private static int parseHexDigits(String s, int start, int count) {
    if (start + count > s.length()) {
        return -1;
    }
    int value = 0;
    for (int k = start; k < start + count; k++) {
        final int digit = Character.digit(s.charAt(k), 16);
        if (digit < 0) {
            return -1;
        }
        value = (value << 4) | digit;
    }
    return value;
}
// NOTE(review): the enclosing method's header and the declarations of sef, host, url,
// list and travelRes are not visible in this excerpt; the comments below describe only
// what the statements here do.
try {
// Look up the entry for host in sef and split it on commas; each piece is compiled as a regex.
String arr[] = sef.get(host).split(",");
for (int i = 0; i < arr.length; i++) {
Pattern pa = Pattern.compile(arr[i]);
Matcher m = pa.matcher(url);
// For each regex that is found in url, keep capture group 1 of the first match.
if (m.find()) {
list.add(m.group(1));
}
}
// URL-decode every captured string and append it to travelRes, comma-separated.
for (int i = 0; i < list.size(); i++) {
String str = list.get(i);
// Every element except the last gets a trailing comma.
if (i < list.size() - 1) {
// UTF-8 is tried first, then GBK; if neither check passes, nothing (not even the comma) is appended.
if (isUTF8(str)) {
travelRes = travelRes+URLDecoder.decode(str, "UTF-8").toString() + ",";
} else if (isGBK(str)) {
travelRes = travelRes+URLDecoder.decode(str, "GBK").toString() + ",";
}
} else {
// Last element: same decoding choice, no trailing comma.
if (isUTF8(str)) {
travelRes = travelRes+URLDecoder.decode(str, "UTF-8").toString() ;
} else if (isGBK(str)) {
travelRes = travelRes+URLDecoder.decode(str, "GBK").toString() ;
}
}
}
// All four handlers are empty: the exception is discarded, processing stops at the
// point of failure, and whatever travelRes holds so far is returned below.
} catch (UnsupportedEncodingException e) {
} catch (IllegalArgumentException e2) {
} catch (StringIndexOutOfBoundsException e3) {
} catch (NullPointerException e4) {
}
return travelRes;
}
}
本文提供了Java代码示例,用于判断URL中字符串的编码是UTF-8还是GBK。

1万+

被折叠的 条评论
为什么被折叠?



