影刀RPA数据去重实战:5种场景下的智能去重方案

影刀RPA数据去重实战:5种场景下的智能去重方案

作者:林焱

数据重复是RPA数据采集中的常见问题。本文针对5种典型去重场景,提供从简单到高级的完整解决方案,让你的数据干净整洁。


前言:重复数据——数据质量的头号敌人

在这里插入图片描述

在RPA数据采集过程中,重复数据几乎是不可避免的:

  • 翻页采集时第一页数据重复
  • 多次运行累积了重复记录
  • 不同来源的数据存在交叉
  • 同一实体的表述略有差异

不去重的后果:分析结论失真、报表数据虚高、决策依据错误。


场景一:Excel表格简单去重

在这里插入图片描述

1.1 单列去重

import openpyxl

def dedup_single_column(file_path, column=1):
    """按指定列去重,保留第一条"""
    wb = openpyxl.load_workbook(file_path)
    ws = wb.active
    
    seen = set()
    rows_to_delete = []
    
    for row in range(2, ws.max_row + 1):
        value = ws.cell(row=row, column=column).value
        if value in seen:
            rows_to_delete.append(row)
        else:
            seen.add(value)
    
    # 从后往前删除,避免行号偏移
    for row in reversed(rows_to_delete):
        ws.delete_rows(row)
    
    wb.save(file_path.replace('.xlsx', '_去重.xlsx'))
    print(f"去重完成:删除 {len(rows_to_delete)} 行重复数据")

# 使用
dedup_single_column("客户数据.xlsx", column=1)  # 按第一列去重

1.2 多列联合去重

拼多多店群自动化报活动上架!

def dedup_multi_column(file_path, columns=[1, 2, 3]):
    """按多列联合去重"""
    wb = openpyxl.load_workbook(file_path)
    ws = wb.active
    
    seen = set()
    rows_to_delete = []
    
    for row in range(2, ws.max_row + 1):
        # 组合多列值作为唯一键
        key = tuple(ws.cell(row=row, column=col).value for col in columns)
        
        ![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/317bb4df2b4d41938494245d5218b2d2.png#pic_center)

        if key in seen:
            rows_to_delete.append(row)
        else:
            seen.add(key)
    
    for row in reversed(rows_to_delete):
        ws.delete_rows(row)
    
    wb.save(file_path.replace('.xlsx', '_去重.xlsx'))
    print(f"去重完成:删除 {len(rows_to_delete)} 行")

1.3 使用pandas去重(推荐)

import pandas as pd

def dedup_pandas(file_path, subset=None, keep='first'):
    """使用pandas去重,更高效"""
    df = pd.read_excel(file_path)
    
    original_count = len(df)
    
    # subset: 指定去重列,None表示全部列
    # keep: 'first'保留第一条, 'last'保留最后一条, False全部删除
    df_dedup = df.drop_duplicates(subset=subset, keep=keep)
    
    deduped_count = len(df_dedup)
    removed = original_count - deduped_count
    
    df_dedup.to_excel(file_path.replace('.xlsx', '_去重.xlsx'), index=False)
    print(f"原始数据: {original_count} 行")
    print(f"去重后: {deduped_count} 行")
    print(f"删除: {removed} 行重复数据")
    
    return df_dedup

# 使用示例
dedup_pandas("订单数据.xlsx", subset=["订单号"], keep='first')
dedup_pandas("客户数据.xlsx", subset=["姓名", "手机号"], keep='last')

场景二:数据库增量去重

2.1 INSERT OR REPLACE方案

import sqlite3

def incremental_insert(db_path, table_name, data_list, unique_keys):
    """增量插入,自动去重"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # 获取表结构
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [col[1] for col in cursor.fetchall()]
    
    # 创建唯一索引(如果不存在)
    unique_cols = ', '.join(unique_keys)
    index_name = f"idx_{'_'.join(unique_keys)}"
    try:
        cursor.execute(f"""
            CREATE UNIQUE INDEX IF NOT EXISTS {index_name} 
            ON {table_name} ({unique_cols})
        """)
    except:
        pass  # 索引已存在
    
    # INSERT OR REPLACE
    ![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/5940f53376ba4a0dbaa061fce84ea436.png#pic_center)

    placeholders = ', '.join(['?'] * len(columns))
    col_str = ', '.join(columns)
    
    inserted = 0
    replaced = 0
    
    for data in data_list:
        values = [data.get(col) for col in columns]
        try:
            cursor.execute(f"""
                INSERT OR REPLACE INTO {table_name} ({col_str})
                VALUES ({placeholders})
            """, values)
            inserted += 1
        except Exception as e:
            print(f"插入失败: {e}")
    
    conn.commit()
    conn.close()
    
    print(f"处理完成: {inserted} 条记录")

2.2 INSERT IGNORE方案(保留旧数据)

def insert_ignore(db_path, table_name, data_list, unique_keys):
    """插入时忽略重复,保留已有数据"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # 创建唯一索引
    unique_cols = ', '.join(unique_keys)
    index_name = f"idx_{'_'.join(unique_keys)}"
    cursor.execute(f"""
        CREATE UNIQUE INDEX IF NOT EXISTS {index_name}
        ON {table_name} ({unique_cols})
    """)
    
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [col[1] for col in cursor.fetchall()]
    
    placeholders = ', '.join(['?'] * len(columns))
    col_str = ', '.join(columns)
    
    new_count = 0
    skip_count = 0
    
    for data in data_list:
        values = [data.get(col) for col in columns]
        try:
            cursor.execute(f"""
                INSERT OR IGNORE INTO {table_name} ({col_str})
                VALUES ({placeholders})
            """, values)
            if cursor.rowcount > 0:
                new_count += 1
            else:
                skip_count += 1
        except Exception as e:
            print(f"错误: {e}")
    
    conn.commit()
    conn.close()
    
    print(f"新增: {new_count}, 跳过重复: {skip_count}")

2.3 增量同步方案(对比差异)

def incremental_sync(source_data, db_path, table_name, unique_keys):
    """增量同步:新增+更新+标记删除"""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    
    # 获取已有数据的唯一键集合
    keys_str = ', '.join(unique_keys)
    cursor.execute(f"SELECT {keys_str} FROM {table_name}")
    existing_keys = set(cursor.fetchall())
    
    # 获取源数据唯一键集合
    source_keys = set()
    for item in source_data:
        key = tuple(item[k] for k in unique_keys)
        source_keys.add(key)
    
    # 计算差异
    to_insert = source_keys - existing_keys  # 新增
    to_delete = existing_keys - source_keys  # 删除
    to_update = source_keys & existing_keys  # 可能更新
    
    print(f"新增: {len(to_insert)}, 删除: {len(to_delete)}, 可能更新: {len(to_update)}")
    
    return to_insert, to_delete, to_update

场景三:模糊匹配去重

在这里插入图片描述

3.1 字符串相似度去重

from difflib import SequenceMatcher

def similarity(a, b):
    """计算两个字符串的相似度"""
    return SequenceMatcher(None, str(a), str(b)).ratio()

def fuzzy_dedup(data_list, key_field, threshold=0.85):
    """模糊匹配去重"""
    unique_items = []
    duplicates = []
    
    for item in data_list:
        value = str(item.get(key_field, ''))
        is_duplicate = False
        
        for existing in unique_items:
            existing_value = str(existing.get(key_field, ''))
            sim = similarity(value, existing_value)
            
            if sim >= threshold:
                duplicates.append({
                    "original": existing_value,
                    "duplicate": value,
                    "similarity": round(sim, 3)
                })
                is_duplicate = True
                break
        
        if not is_duplicate:
            unique_items.append(item)
    
    print(f"原始: {len(data_list)}, 去重后: {len(unique_items)}, 重复: {len(duplicates)}")
    return unique_items, duplicates

# 使用
data = [
    {"name": "北京科技有限公司", "amount": 10000},
    {"name": "北京科技有限责任公司", "amount": 12000},  # 相似
    {"name": "上海创新科技", "amount": 8000},
    {"name": "上海创新科技有限公司", "amount": 9000},  # 相似
]

unique, dupes = fuzzy_dedup(data, "name", threshold=0.8)

3.2 编辑距离去重

def levenshtein_distance(s1, s2):
    """计算编辑距离"""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    
    if len(s2) == 0:
        return len(s1)
    
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

def dedup_by_edit_distance(data_list, key_field, max_distance=2):
    """基于编辑距离的去重"""
    unique = []
    for item in data_list:
        value = str(item.get(key_field, ''))
        is_dup = False
        for existing in unique:
            existing_value = str(existing.get(key_field, ''))
            dist = levenshtein_distance(value, existing_value)
            if dist <= max_distance:
                is_dup = True
                break
        if not is_dup:
            unique.append(item)
    return unique

在这里插入图片描述

场景四:多源数据合并去重

4.1 优先级合并

def merge_with_priority(sources, key_field, priority_order):
    """
    多数据源合并去重,按优先级选择保留哪条
    priority_order: 数据源优先级列表,越靠前优先级越高
    """
    merged = {}
    
    for source_name in reversed(priority_order):
        data = sources.get(source_name, [])
        for item in data:
            key = item.get(key_field)
            if key:
                merged[key] = {
                    **item,
                    "_source": source_name
                }
    
    result = list(merged.values())
    print(f"合并结果: {len(result)} 条(来自 {len(sources)} 个数据源)")
    return result

# 使用
sources = {
    "ERP系统": [
        {"order_id": "ORD001", "amount": 100, "customer": "A公司"},
        {"order_id": "ORD002", "amount": 200, "customer": "B公司"},
    ],
    "CRM系统": [
        {"order_id": "ORD001", "amount": 100, "customer": "A公司", "contact": "张三"},
        {"order_id": "ORD003", "amount": 300, "customer": "C公司"},
    ]
}

# ERP优先级高于CRM
result = merge_with_priority(sources, "order_id", ["ERP系统", "CRM系统"])

4.2 字段级合并(取最优值)

def smart_merge(sources, key_field, field_rules):
    """
    智能合并:每个字段根据规则选择最优值
    field_rules: {"字段名": "规则"} 规则可以是 "newest", "non_empty", "max", "min"
    """
    merged = {}
    
    for source_name, data in sources.items():
        for item in data:
            key = item.get(key_field)
            if not key:
                continue
            
            if key not in merged:
                merged[key] = {}
            
            for field, value in item.items():
                if field == key_field:
                    merged[key][field] = value
                    continue
                
                rule = field_rules.get(field, "newest")
                existing = merged[key].get(field)
                
                
[video(video-JOHBHMHy-1782024122427)(type-csdn)(url-https://live.csdn.net/v/embed/526817)(image-https://v-
![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/9da48aa5d78f4710a5f68c42423c3941.png#pic_center)
blog.csdnimg.cn/asset/1d3c3709da119dd8c13ab01e9b282520/cover/Cover0.jpg)(title-TEMU店群矩阵自动化运营核价报活动)]

                if rule == "newest":
                    merged[key][field] = value
                elif rule == "non_empty":
                    if value is not None and value != "":
                        merged[key][field] = value
                elif rule == "max":
                    if existing is None or (value is not None and value > existing):
                        merged[key][field] = value
                elif rule == "min":
                    if existing is None or (value is not None and value < existing):
                        merged[key][field] = value
    
    return list(merged.values())

# 使用
field_rules = {
    "amount": "max",       # 金额取最大
    "update_time": "newest", # 时间取最新
    "contact": "non_empty",  # 联系人取非空
    "discount": "min"        # 折扣取最小
}

场景五:大数据量高效去重

5.1 分批去重(内存友好)

def batch_dedup(file_path, key_column, batch_size=10000):
    """大数据量分批去重"""
    import pandas as pd
    
    seen_keys = set()
    result_rows = []
    
    # 分批读取
    for chunk in pd.read_excel(file_path, chunksize=batch_size):
        mask = ~chunk[key_column].isin(seen_keys)
        new_rows = chunk[mask]
        
        # 更新已见过的键
        seen_keys.update(new_rows[key_column].tolist())
        
        result_rows.append(new_rows)
        print(f"已处理 {len(seen_keys)} 条唯一记录")
    
    # 合并结果
    result = pd.concat(result_rows, ignore_index=True)
    result.to_excel(file_path.replace('.xlsx', '_去重.xlsx'), index=False)
    print(f"去重完成: {len(result)} 条唯一记录")
    return result

5.2 哈希去重(极致性能)

import hashlib

def hash_dedup(data_iterable, key_fields):
    """基于哈希的高效去重"""
    seen_hashes = set()
    unique_items = []
    
    for item in data_iterable:
        # 生成唯一哈希
        key_str = '|'.join(str(item.get(f, '')) for f in key_fields)
        ![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/4c6a644ed7c740a6865436eba2954b6c.png#pic_center)

        item_hash = hashlib.md5(key_str.encode()).hexdigest()
        
        if item_hash not in seen_hashes:
            seen_hashes.add(item_hash)
            unique_items.append(item)
    
    return unique_items

5.3 Bloom Filter去重(超大数据)

class SimpleBloomFilter:
    """简易布隆过滤器,适用于海量数据去重"""
    
    def __init__(self, size=1000000, hash_count=3):
        self.size = size
        self.hash_count = hash_count
        self.bit_array = [False] * size
    
    def _get_hashes(self, item):
        """获取多个哈希值"""
        hashes = []
        for i in range(self.hash_count):
            h = hashlib.md5(f"{item}_{i}".encode()).hexdigest()
            hashes.append(int(h, 16) % self.size)
        return hashes
    
    def add(self, item):
        """添加元素"""
        for h in self._get_hashes(item):
            self.bit_array[h] = True
    
    def might_contain(self, item):
        """判断是否可能包含(可能有误判)"""
        return all(self.bit_array[h] for h in self._get_hashes(item))

# 使用
bf = SimpleBloomFilter(size=10000000)

for item in data_stream:
    key = f"{item['name']}|{item['phone']}"
    if bf.might_contain(key):
        continue  # 可能重复,跳过
    bf.add(key)
    # 处理新数据...

去重方案选型指南

场景数据量推荐方案优势
Excel简单去重<10万行pandas drop_duplicates简单高效
数据库增量任意INSERT OR IGNORE数据库原生支持
模糊匹配<1万条difflib相似度容忍拼写差异
多源合并中等优先级+字段规则灵活可控
超大数据>100万条布隆过滤器内存占用极低

在这里插入图片描述

结语

数据去重看似简单,实则涉及多种场景和策略。核心原则:

  1. 确定唯一键:先搞清楚什么算"重复"
  2. 选择保留策略:保留第一条、最后一条、还是最优字段
  3. 增量优于全量:持续运行用增量去重,一次性处理用全量去重
  4. 性能与精度权衡:大数据量用布隆过滤器,高精度用模糊匹配

记住:去重不只是删数据,更是确保数据质量的最后一道防线
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值