Skip to content Skip to sidebar Skip to footer

Combine Dictionary Entries By Common Elements

I have a very big dictionary with keys containing a list of items. I would like to group all the keys that have at least one element in the list in common in an efficient way. For

Solution 1:

The program below solves the original problem. There may be a more efficient algorithm, but I think this one is reasonably fast.

It shouldn't be too hard to modify this code to cope with the more complex dict in the updated version of the problem.

(I'm using Python 2.6, so I don't have dict comprehensions, which is why I'm building dicts using generator expressions).

merge_lists.py

#! /usr/bin/env python

''' Merge lists in a dict 

    Lists are merged if they have any element in common,
    so that in the resulting dict no list element will be 
    associated with more than one key.

    Written by PM 2Ring 2014.11.18

    From http://stackoverflow.com/q/26972204/4014959
'''

#Some test data
groups = {
    'g01': ['a', 'b', 'c', 'd'], 
    'g02': ['a', 'b', 'c', 'd', 'e'], 
    'g03': ['f', 'g', 'h'], 
    'g04': ['g', 'j'],
    #'g05': ['g', 'a'],
    'g06': ['k', 'l'],
    'g07': ['l', 'm'],
    'g08': ['m', 'n'],
    'g09': ['o', 'p'],
    'g10': ['p', 'q'],
    'g11': ['q', 'o'],
    #'g12': ['l', 'q'],
}


def merge_lists(d):
    src = dict((k, set(v)) for k, v in d.iteritems())

    while True:
        dest = {}
        count = 0
        while src:
            k1, temp = src.popitem()
            if temp is None:
                continue
            for k2, v in src.iteritems():
                if v is None:
                    continue
                if temp & v:
                    temp |= v
                    src[k2] = None
                    count += 1
                    k1 = min(k1, k2)
            dest[k1] = temp

        if count > 0:
            #print count
            #print_dict(dest)
            src = dest
        else:
            dest = dict((k, sorted(list(v))) for k, v in dest.iteritems())
            return dest


def print_dict(d):
    for k in sorted(d.keys()):
        print "%s: %s" % (k, d[k])
    print


def main():
    print_dict(groups)
    print 20*'-'

    dest = merge_lists(groups)
    print_dict(dest)


if __name__ == '__main__':  
    main()

output

g02: ['a', 'b', 'c', 'd', 'e']
g03: ['f', 'g', 'h']
g04: ['g', 'j']
g06: ['k', 'l']
g07: ['l', 'm']
g08: ['m', 'n']
g09: ['o', 'p']
g10: ['p', 'q']
g11: ['q', 'o']

--------------------
g01: ['a', 'b', 'c', 'd', 'e']
g03: ['f', 'g', 'h', 'j']
g06: ['k', 'l', 'm', 'n']
g09: ['o', 'p', 'q']

Here's a version that works on the updated dict structure.

#! /usr/bin/env python

''' Merge lists in a dict 

    Lists are merged if they have any element in common,
    so that in the resulting dict no list element will be 
    associated with more than one key.

    The key of the merged item is selected from the sub-dict with the lowest
    value of oldest_node.

    Written by PM 2Ring 2014.11.21

    From http://stackoverflow.com/q/26972204/4014959
'''

#Some test data

groups = {
    'group1': {'IDs': ['a','b','c','d'], 'oldest_node': 'node_30'}, 
    'group2': {'IDs': ['c','d','e'], 'oldest_node': 'node_40'}, 
    'group3': {'IDs': ['h','k'], 'oldest_node': 'node_2'}, 
    'group4': {'IDs': ['z','w','x','j'], 'oldest_node': 'node_6'}, 
    'group5': {'IDs': ['h','z'], 'oldest_node': 'node_9'},
}


def merge_lists(d):
    #Convert IDs to a set and oldest_node to an int 
    src = {}
    for k, v in d.iteritems():
        src[k] = {
            'IDs': set(v['IDs']), 
            'oldest_node': int(v['oldest_node'][5:])
        } 
    #print_dict(src)

    while True:
        dest = {}
        count = 0
        while src:
            k1, temp = src.popitem()
            if temp is None:
                continue
            for k2, v in src.iteritems():
                if v is None:
                    continue
                if temp['IDs'] & v['IDs']:
                    #Merge IDs
                    temp['IDs'] |= v['IDs']

                    #Determine key of merge from oldest_node
                    if v['oldest_node'] < temp['oldest_node']:
                        k1 = k2
                        temp['oldest_node'] = v['oldest_node']
                    src[k2] = None
                    count += 1
            dest[k1] = temp

        src = dest

        #Exit loop if no changes occured 
        if count == 0:
            break
        else:
            #print count
            #print_dict(src)
            pass

    #Convert dict back to original form
    dest = {}
    for k, v in src.iteritems():
        dest[k] = {
            'IDs': sorted(list(v['IDs'])), 
            'oldest_node': 'node_%d' % v['oldest_node']
        }
    return dest


def print_dict(d):
    for k in sorted(d.keys()):
        print "%s: %s" % (k, d[k])
    print


def main():
    print_dict(groups)
    print 20*'-'

    dest = merge_lists(groups)
    print_dict(dest)


if __name__ == '__main__':  
    main()

output

group1: {'IDs': ['a', 'b', 'c', 'd'], 'oldest_node': 'node_30'}
group2: {'IDs': ['c', 'd', 'e'], 'oldest_node': 'node_40'}
group3: {'IDs': ['h', 'k'], 'oldest_node': 'node_2'}
group4: {'IDs': ['z', 'w', 'x', 'j'], 'oldest_node': 'node_6'}
group5: {'IDs': ['h', 'z'], 'oldest_node': 'node_9'}

--------------------
group1: {'IDs': ['a', 'b', 'c', 'd', 'e'], 'oldest_node': 'node_30'}
group3: {'IDs': ['h', 'j', 'k', 'w', 'x', 'z'], 'oldest_node': 'node_2'}

Solution 2:

Do you really want to change the dictionary that is used as input or is it okay if your function outputs another dictionary as a result?

Here is a quick and dirty function, that groups the values:

def group_dict(d):
    result = {}
    for k1 in d:
        for k2 in d:
            if k1 != k2 and set(d.get(k1)).intersection(d.get(k2)):
                result[k1] = list(set(d.get(k1)).union(d.get(k2)))
    return result

It should return:

{'group1': ['a', 'c', 'b', 'e', 'd'],
 'group2': ['a', 'c', 'b', 'e', 'd'],
 'group3': ['h', 'z', 'g', 'f'],
 'group4': ['h', 'z', 'g', 'f']}

Expand the function to remove the duplicates.

I'm using the built-in set and its methods intersection and union. That should be the key for your core requirements. In a double for-loop (very ugly) the values in the dictionary get compared and if intersection is found, the union of the values is converted to list and assigned to the result dictionary. It's really not a nice solution, but maybe it can give you some idea.


Post a Comment for "Combine Dictionary Entries By Common Elements"