问题
今天在 Hive 中 get_json_object 函数解析 json 串的时候,发现函数不支持解析 json 中文 key。
例如:
select get_json_object('{ "姓名":"张三" , "年龄":"18" }', '$.姓名');
我们希望的结果是得到姓名对应的值张三,而运行之后的结果为 NULL 值。
select get_json_object('{ "abc姓名":"张三" , "abc":"18" }', '$.abc姓名');
我们希望的结果是得到姓名对应的值张三,而运行之后的结果为 18 。
产生问题的原因
是什么原因导致的呢?我们查看 Hive 官网中 get_json_object 函数的介绍,可以发现 get_json_object 函数不能解析 json 里面中文的 key,如下图所示:

json 路径只能包含字符 [0-9a-z_],即不能包含 大写或特殊字符 。此外,键不能以数字开头。
那为什么 json 路径只能包含字符 [0-9a-z_] 呢?
通过查看源码我们发现 get_json_object 对应的 UDF 类的源码如下:
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.fasterxml.jackson.core.json.JsonReadFeature;
import com.fasterxml.jackson.databind.JavaType;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterators;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
* UDFJson.
*/
@Description(name = "get_json_object",
value = "_FUNC_(json_txt, path) - Extract a json object from path ",
extended = "Extract json object from a json string based on json path "
+ "specified, and return json string of the extracted json object. It "
+ "will return null if the input json string is invalid.\n"
+ "A limited version of JSONPath supported:\n"
+ " $ : Root object\n"
+ " . : Child operator\n"
+ " [] : Subscript operator for array\n"
+ " * : Wildcard for []\n"
+ "Syntax not supported that's worth noticing:\n"
+ " '' : Zero length string as key\n"
+ " .. : Recursive descent\n"
+ " @ : Current object/element\n"
+ " () : Script expression\n"
+ " ?() : Filter (script) expression.\n"
+ " [,] : Union operator\n"
+ " [start:end:step] : array slice operator\n")
//定义了一个名为UDFJson的类,继承自UDF类。
public class UDFGetJsonObjectCN extends UDF {
//定义一个静态正则表达式模式,用于匹配JSON路径中的键。
//匹配英文key:匹配一个或多个大写字母、小写字母、数字、下划线、连字符、冒号或空格。
private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*");
//定义一个静态正则表达式模式,用于匹配JSON路径中的索引。
private static final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]");
//创建一个ObjectMapper对象,用于解析JSON字符串。
private static final ObjectMapper objectMapper = new ObjectMapper();
//创建一个JavaType对象,用于表示Map类型。
private static final JavaType MAP_TYPE = objectMapper.getTypeFactory().constructType(Map.class);
//创建一个JavaType对象,用于表示List类型。
private static final JavaType LIST_TYPE = objectMapper.getTypeFactory().constructType(List.class);
//静态代码块,用于配置ObjectMapper的一些特性。
static {
// Allows for unescaped ASCII control characters in JSON values
objectMapper.enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS.mappedFeature());
// Enabled to accept quoting of all character backslash qooting mechanism
objectMapper.enable(JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER.mappedFeature());
}
// An LRU cache using a linked hash map
//定义了一个静态内部类HashCache,用作LRU缓存。
static class HashCache<K, V> extends LinkedHashMap<K, V> {
private static final int CACHE_SIZE = 16;
private static final int INIT_SIZE = 32;
private static final float LOAD_FACTOR = 0.6f;
HashCache() {
super(INIT_SIZE, LOAD_FACTOR);
}
private static final long serialVersionUID = 1;
@Override
protected boolean removeEldestEntry(Map.Entry<K, V> eldest) {
return size() > CACHE_SIZE;
}
}
//声明了一个名为extractObjectCache的HashMap对象,用于缓存已提取的JSON对象。
Map<String, Object> extractObjectCache = new HashCache<String, Object>();
//声明了一个名为pathExprCache的HashMap对象,用于缓存已解析的JSON路径表达式。
Map<String, String[]> pathExprCache = new HashCache<String, String[]>();
//声明了一个名为indexListCache的HashMap对象,用于缓存已解析的JSON路径中的索引列表。
Map<String, ArrayList<String>> indexListCache =
new HashCache<String, ArrayList<String>>();
//声明了一个名为mKeyGroup1Cache的HashMap对象,用于缓存JSON路径中的键。
Map<String, String> mKeyGroup1Cache = new HashCache<String, String>();
//声明了一个名为mKeyMatchesCache的HashMap对象,用于缓存JSON路径中的键是否匹配的结果。
Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>();
//构造函数,没有参数。
public UDFGetJsonObjectCN() {
}
/**
* Extract json object from a json string based on json path specified, and
* return json string of the extracted json object. It will return null if the
* input json string is invalid.
*
* A limited version of JSONPath supported: $ : Root object . : Child operator
* [] : Subscript operator for array * : Wildcard for []
*
* Syntax not supported that's worth noticing: '' : Zero length string as key
* .. : Recursive descent &#064; : Current object/element () : Script
* expression ?() : Filter (script) expression. [,] : Union operator
* [start:end:step] : array slice operator
*
* @param jsonString
* the json string.
* @param pathString
* the json path expression.
* @return json string or null when an error happens.
*/
//evaluate方法,用于提取指定路径的JSON对象并返回JSON字符串。
public Text evaluate(String jsonString, String pathString) {
if (jsonString == null || jsonString.isEmpty() || pathString == null
|| pathString.isEmpty() || pathString.charAt(0) != '$') {
return null;
}
int pathExprStart = 1;
boolean unknownType = pathString.equals("$");
boolean isRootArray = false;
if (pathString.length() > 1) {
if (pathString.charAt(1) == '[') {
pathExprStart = 0;
isRootArray = true;
} else if (pathString.charAt(1) == '.') {
isRootArray = pathString.length() > 2 && pathString.charAt(2) == '[';
} else {
return null;
}
}
// Cache pathExpr
String[] pathExpr = pathExprCache.get(pathString);
if (pathExpr == null) {
pathExpr = pathString.split("\\.", -1);
pathExprCache.put(pathString, pathExpr);
}
// Cache extractObject
Object extractObject = extractObjectCache.get(jsonString);
if (extractObject == null) {
if (unknownType) {
try {
extractObject = objectMapper.readValue(jsonString, LIST_TYPE);
} catch (Exception e) {
// Ignore exception
}
if (extractObject == null) {
try {
extractObject = objectMapper.readValue(jsonString, MAP_TYPE);
} catch (Exception e) {
return null;
}
}
} else {
JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE;
try {
extractObject = objectMapper.readValue(jsonString, javaType);
} catch (Exception e) {
return null;
}
}
extractObjectCache.put(jsonString, extractObject);
}
for (int i = pathExprStart; i < pathExpr.length; i++) {
if (extractObject == null) {
return null;
}
extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray);
}
Text result = new Text();
if (extractObject instanceof Map || extractObject instanceof List) {
try {
result.set(objectMapper.writeValueAsString(extractObject));
} catch (Exception e) {
return null;
}
} else if (extractObject != null) {
result.set(extractObject.toString());
} else {
return null;
}
return result;
}
//extract方法,递归地提取JSON对象。
private Object extract(Object json, String path, boolean skipMapProc) {
// skip MAP processing for the first path element if root is array
if (!skipMapProc) {
// Cache patternkey.matcher(path).matches()
Matcher mKey = null;
Boolean mKeyMatches = mKeyMatchesCache.get(path);
if (mKeyMatches == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
}
if (!mKeyMatches.booleanValue()) {
return null;
}
// Cache mkey.group(1)
String mKeyGroup1 = mKeyGroup1Cache.get(path);
if (mKeyGroup1 == null) {
if (mKey == null) {
mKey = patternKey.matcher(path);
mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE;
mKeyMatchesCache.put(path, mKeyMatches);
if (!mKeyMatches.booleanValue()) {
return null;
}
}
mKeyGroup1 = mKey.group(1);
mKeyGroup1Cache.put(path, mKeyGroup1);
}
json = extract_json_withkey(json, mKeyGroup1);
}
// Cache indexList
ArrayList<String> indexList = indexListCache.get(path);
if (indexList == null) {
Matcher mIndex = patternIndex.matcher(path);
indexList = new ArrayList<String>();
while (mIndex.find()) {
indexL

本文主要探讨了在Hive中使用函数解析JSON串时,函数不支持解析JSON中文key的问题。分析了产生问题的原因是匹配key的模式串限制,无法匹配中文key。并给出了解决办法,即修改代码中匹配key的正则表达式,还分别给出了Hive - 2.1.1和Hive - 4.0.0版本的处理方式。

2287

被折叠的 条评论
为什么被折叠?



