影刀RPA数据去重实战:5种场景下的智能去重方案
作者:林焱
数据重复是RPA数据采集中的常见问题。本文针对5种典型去重场景,提供从简单到高级的完整解决方案,让你的数据干净整洁。
前言:重复数据——数据质量的头号敌人

在RPA数据采集过程中,重复数据几乎是不可避免的:
- 翻页采集时第一页数据重复
- 多次运行累积了重复记录
- 不同来源的数据存在交叉
- 同一实体的表述略有差异
不去重的后果:分析结论失真、报表数据虚高、决策依据错误。
场景一:Excel表格简单去重

1.1 单列去重
import openpyxl
def dedup_single_column(file_path, column=1):
"""按指定列去重,保留第一条"""
wb = openpyxl.load_workbook(file_path)
ws = wb.active
seen = set()
rows_to_delete = []
for row in range(2, ws.max_row + 1):
value = ws.cell(row=row, column=column).value
if value in seen:
rows_to_delete.append(row)
else:
seen.add(value)
# 从后往前删除,避免行号偏移
for row in reversed(rows_to_delete):
ws.delete_rows(row)
wb.save(file_path.replace('.xlsx', '_去重.xlsx'))
print(f"去重完成:删除 {len(rows_to_delete)} 行重复数据")
# 使用
dedup_single_column("客户数据.xlsx", column=1) # 按第一列去重
1.2 多列联合去重
拼多多店群自动化报活动上架!
def dedup_multi_column(file_path, columns=[1, 2, 3]):
"""按多列联合去重"""
wb = openpyxl.load_workbook(file_path)
ws = wb.active
seen = set()
rows_to_delete = []
for row in range(2, ws.max_row + 1):
# 组合多列值作为唯一键
key = tuple(ws.cell(row=row, column=col).value for col in columns)

if key in seen:
rows_to_delete.append(row)
else:
seen.add(key)
for row in reversed(rows_to_delete):
ws.delete_rows(row)
wb.save(file_path.replace('.xlsx', '_去重.xlsx'))
print(f"去重完成:删除 {len(rows_to_delete)} 行")
1.3 使用pandas去重(推荐)
import pandas as pd
def dedup_pandas(file_path, subset=None, keep='first'):
"""使用pandas去重,更高效"""
df = pd.read_excel(file_path)
original_count = len(df)
# subset: 指定去重列,None表示全部列
# keep: 'first'保留第一条, 'last'保留最后一条, False全部删除
df_dedup = df.drop_duplicates(subset=subset, keep=keep)
deduped_count = len(df_dedup)
removed = original_count - deduped_count
df_dedup.to_excel(file_path.replace('.xlsx', '_去重.xlsx'), index=False)
print(f"原始数据: {original_count} 行")
print(f"去重后: {deduped_count} 行")
print(f"删除: {removed} 行重复数据")
return df_dedup
# 使用示例
dedup_pandas("订单数据.xlsx", subset=["订单号"], keep='first')
dedup_pandas("客户数据.xlsx", subset=["姓名", "手机号"], keep='last')
场景二:数据库增量去重
2.1 INSERT OR REPLACE方案
import sqlite3
def incremental_insert(db_path, table_name, data_list, unique_keys):
"""增量插入,自动去重"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# 获取表结构
cursor.execute(f"PRAGMA table_info({table_name})")
columns = [col[1] for col in cursor.fetchall()]
# 创建唯一索引(如果不存在)
unique_cols = ', '.join(unique_keys)
index_name = f"idx_{'_'.join(unique_keys)}"
try:
cursor.execute(f"""
CREATE UNIQUE INDEX IF NOT EXISTS {index_name}
ON {table_name} ({unique_cols})
""")
except:
pass # 索引已存在
# INSERT OR REPLACE

placeholders = ', '.join(['?'] * len(columns))
col_str = ', '.join(columns)
inserted = 0
replaced = 0
for data in data_list:
values = [data.get(col) for col in columns]
try:
cursor.execute(f"""
INSERT OR REPLACE INTO {table_name} ({col_str})
VALUES ({placeholders})
""", values)
inserted += 1
except Exception as e:
print(f"插入失败: {e}")
conn.commit()
conn.close()
print(f"处理完成: {inserted} 条记录")
2.2 INSERT IGNORE方案(保留旧数据)
def insert_ignore(db_path, table_name, data_list, unique_keys):
"""插入时忽略重复,保留已有数据"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# 创建唯一索引
unique_cols = ', '.join(unique_keys)
index_name = f"idx_{'_'.join(unique_keys)}"
cursor.execute(f"""
CREATE UNIQUE INDEX IF NOT EXISTS {index_name}
ON {table_name} ({unique_cols})
""")
cursor.execute(f"PRAGMA table_info({table_name})")
columns = [col[1] for col in cursor.fetchall()]
placeholders = ', '.join(['?'] * len(columns))
col_str = ', '.join(columns)
new_count = 0
skip_count = 0
for data in data_list:
values = [data.get(col) for col in columns]
try:
cursor.execute(f"""
INSERT OR IGNORE INTO {table_name} ({col_str})
VALUES ({placeholders})
""", values)
if cursor.rowcount > 0:
new_count += 1
else:
skip_count += 1
except Exception as e:
print(f"错误: {e}")
conn.commit()
conn.close()
print(f"新增: {new_count}, 跳过重复: {skip_count}")
2.3 增量同步方案(对比差异)
def incremental_sync(source_data, db_path, table_name, unique_keys):
"""增量同步:新增+更新+标记删除"""
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# 获取已有数据的唯一键集合
keys_str = ', '.join(unique_keys)
cursor.execute(f"SELECT {keys_str} FROM {table_name}")
existing_keys = set(cursor.fetchall())
# 获取源数据唯一键集合
source_keys = set()
for item in source_data:
key = tuple(item[k] for k in unique_keys)
source_keys.add(key)
# 计算差异
to_insert = source_keys - existing_keys # 新增
to_delete = existing_keys - source_keys # 删除
to_update = source_keys & existing_keys # 可能更新
print(f"新增: {len(to_insert)}, 删除: {len(to_delete)}, 可能更新: {len(to_update)}")
return to_insert, to_delete, to_update
场景三:模糊匹配去重

3.1 字符串相似度去重
from difflib import SequenceMatcher
def similarity(a, b):
"""计算两个字符串的相似度"""
return SequenceMatcher(None, str(a), str(b)).ratio()
def fuzzy_dedup(data_list, key_field, threshold=0.85):
"""模糊匹配去重"""
unique_items = []
duplicates = []
for item in data_list:
value = str(item.get(key_field, ''))
is_duplicate = False
for existing in unique_items:
existing_value = str(existing.get(key_field, ''))
sim = similarity(value, existing_value)
if sim >= threshold:
duplicates.append({
"original": existing_value,
"duplicate": value,
"similarity": round(sim, 3)
})
is_duplicate = True
break
if not is_duplicate:
unique_items.append(item)
print(f"原始: {len(data_list)}, 去重后: {len(unique_items)}, 重复: {len(duplicates)}")
return unique_items, duplicates
# 使用
data = [
{"name": "北京科技有限公司", "amount": 10000},
{"name": "北京科技有限责任公司", "amount": 12000}, # 相似
{"name": "上海创新科技", "amount": 8000},
{"name": "上海创新科技有限公司", "amount": 9000}, # 相似
]
unique, dupes = fuzzy_dedup(data, "name", threshold=0.8)
3.2 编辑距离去重
def levenshtein_distance(s1, s2):
"""计算编辑距离"""
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def dedup_by_edit_distance(data_list, key_field, max_distance=2):
"""基于编辑距离的去重"""
unique = []
for item in data_list:
value = str(item.get(key_field, ''))
is_dup = False
for existing in unique:
existing_value = str(existing.get(key_field, ''))
dist = levenshtein_distance(value, existing_value)
if dist <= max_distance:
is_dup = True
break
if not is_dup:
unique.append(item)
return unique

场景四:多源数据合并去重
4.1 优先级合并
def merge_with_priority(sources, key_field, priority_order):
"""
多数据源合并去重,按优先级选择保留哪条
priority_order: 数据源优先级列表,越靠前优先级越高
"""
merged = {}
for source_name in reversed(priority_order):
data = sources.get(source_name, [])
for item in data:
key = item.get(key_field)
if key:
merged[key] = {
**item,
"_source": source_name
}
result = list(merged.values())
print(f"合并结果: {len(result)} 条(来自 {len(sources)} 个数据源)")
return result
# 使用
sources = {
"ERP系统": [
{"order_id": "ORD001", "amount": 100, "customer": "A公司"},
{"order_id": "ORD002", "amount": 200, "customer": "B公司"},
],
"CRM系统": [
{"order_id": "ORD001", "amount": 100, "customer": "A公司", "contact": "张三"},
{"order_id": "ORD003", "amount": 300, "customer": "C公司"},
]
}
# ERP优先级高于CRM
result = merge_with_priority(sources, "order_id", ["ERP系统", "CRM系统"])
4.2 字段级合并(取最优值)
def smart_merge(sources, key_field, field_rules):
"""
智能合并:每个字段根据规则选择最优值
field_rules: {"字段名": "规则"} 规则可以是 "newest", "non_empty", "max", "min"
"""
merged = {}
for source_name, data in sources.items():
for item in data:
key = item.get(key_field)
if not key:
continue
if key not in merged:
merged[key] = {}
for field, value in item.items():
if field == key_field:
merged[key][field] = value
continue
rule = field_rules.get(field, "newest")
existing = merged[key].get(field)
[video(video-JOHBHMHy-1782024122427)(type-csdn)(url-https://live.csdn.net/v/embed/526817)(image-https://v-

blog.csdnimg.cn/asset/1d3c3709da119dd8c13ab01e9b282520/cover/Cover0.jpg)(title-TEMU店群矩阵自动化运营核价报活动)]
if rule == "newest":
merged[key][field] = value
elif rule == "non_empty":
if value is not None and value != "":
merged[key][field] = value
elif rule == "max":
if existing is None or (value is not None and value > existing):
merged[key][field] = value
elif rule == "min":
if existing is None or (value is not None and value < existing):
merged[key][field] = value
return list(merged.values())
# 使用
field_rules = {
"amount": "max", # 金额取最大
"update_time": "newest", # 时间取最新
"contact": "non_empty", # 联系人取非空
"discount": "min" # 折扣取最小
}
场景五:大数据量高效去重
5.1 分批去重(内存友好)
def batch_dedup(file_path, key_column, batch_size=10000):
"""大数据量分批去重"""
import pandas as pd
seen_keys = set()
result_rows = []
# 分批读取
for chunk in pd.read_excel(file_path, chunksize=batch_size):
mask = ~chunk[key_column].isin(seen_keys)
new_rows = chunk[mask]
# 更新已见过的键
seen_keys.update(new_rows[key_column].tolist())
result_rows.append(new_rows)
print(f"已处理 {len(seen_keys)} 条唯一记录")
# 合并结果
result = pd.concat(result_rows, ignore_index=True)
result.to_excel(file_path.replace('.xlsx', '_去重.xlsx'), index=False)
print(f"去重完成: {len(result)} 条唯一记录")
return result
5.2 哈希去重(极致性能)
import hashlib
def hash_dedup(data_iterable, key_fields):
"""基于哈希的高效去重"""
seen_hashes = set()
unique_items = []
for item in data_iterable:
# 生成唯一哈希
key_str = '|'.join(str(item.get(f, '')) for f in key_fields)

item_hash = hashlib.md5(key_str.encode()).hexdigest()
if item_hash not in seen_hashes:
seen_hashes.add(item_hash)
unique_items.append(item)
return unique_items
5.3 Bloom Filter去重(超大数据)
class SimpleBloomFilter:
"""简易布隆过滤器,适用于海量数据去重"""
def __init__(self, size=1000000, hash_count=3):
self.size = size
self.hash_count = hash_count
self.bit_array = [False] * size
def _get_hashes(self, item):
"""获取多个哈希值"""
hashes = []
for i in range(self.hash_count):
h = hashlib.md5(f"{item}_{i}".encode()).hexdigest()
hashes.append(int(h, 16) % self.size)
return hashes
def add(self, item):
"""添加元素"""
for h in self._get_hashes(item):
self.bit_array[h] = True
def might_contain(self, item):
"""判断是否可能包含(可能有误判)"""
return all(self.bit_array[h] for h in self._get_hashes(item))
# 使用
bf = SimpleBloomFilter(size=10000000)
for item in data_stream:
key = f"{item['name']}|{item['phone']}"
if bf.might_contain(key):
continue # 可能重复,跳过
bf.add(key)
# 处理新数据...
去重方案选型指南
| 场景 | 数据量 | 推荐方案 | 优势 |
|---|---|---|---|
| Excel简单去重 | <10万行 | pandas drop_duplicates | 简单高效 |
| 数据库增量 | 任意 | INSERT OR IGNORE | 数据库原生支持 |
| 模糊匹配 | <1万条 | difflib相似度 | 容忍拼写差异 |
| 多源合并 | 中等 | 优先级+字段规则 | 灵活可控 |
| 超大数据 | >100万条 | 布隆过滤器 | 内存占用极低 |

结语
数据去重看似简单,实则涉及多种场景和策略。核心原则:
- 确定唯一键:先搞清楚什么算"重复"
- 选择保留策略:保留第一条、最后一条、还是最优字段
- 增量优于全量:持续运行用增量去重,一次性处理用全量去重
- 性能与精度权衡:大数据量用布隆过滤器,高精度用模糊匹配
记住:去重不只是删数据,更是确保数据质量的最后一道防线。


4820

被折叠的 条评论
为什么被折叠?



