#Python Script

#Tested with python 2.7
#Last modified 01/26/2018
#Dependency for this code: genderize (https://github.com/SteelPangolin/genderize)
#Dependency for genderize: requests v1 (installable via pip)
#Genderize installation via pip : pip install git+https://github.com/SteelPangolin/genderize.git

#imports
import csv, re, sys
from genderize import Genderize
from copy import deepcopy
import numpy as np

#logs console output, mostly for preliminary counts
#copied from http://stackoverflow.com/a/5916874
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message goes both to the stream that was ``sys.stdout`` when the
    Logger was created and to a log file opened in append mode. It is meant
    to be assigned to ``sys.stdout`` so that printed output is also logged.
    """

    def __init__(self, filename="Default.log"):
        # Remember the stream that is stdout right now so output stays visible.
        self.terminal = sys.stdout
        # Append mode: repeated runs add to the same log instead of erasing it.
        self.log = open(filename, "a")

    def write(self, message):
        """Write *message* to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams.

        Anything that treats this object as stdout may call ``flush()``;
        without this method such a call raises AttributeError.
        """
        self.terminal.flush()
        self.log.flush()

#does some processing of name lists, getting rid of any extra white space and initials
def process_names(name_list):
    """Clean a list of name tokens in place and return that same list.

    Three things are removed:
      * single-letter initials, with or without a trailing period
        ("j", "J."),
      * empty strings left over from extra whitespace,
      * every comma-terminated token after the first one, which shows up
        for names formatted like "Smith, Ph.D, John".

    name_list: list of strings; it is modified in place.
    Returns: the same list object, cleaned.
    """
    # [a-zA-Z] on purpose: the range A-z would also match the punctuation
    # characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ `).
    initial_pattern = re.compile(r'^[a-zA-Z]\.?$')

    # Slice assignment keeps the caller's list object, as before.
    name_list[:] = [
        item for item in name_list
        if item != '' and not initial_pattern.search(item)
    ]

    # Keep the first comma-terminated token (the last name) and drop the
    # later ones. list.remove() is used so that duplicated tokens are
    # handled exactly as they always were (first occurrence goes).
    items_ending_in_commas = [item for item in name_list if item[-1] == ',']
    for extra in items_ending_in_commas[1:]:
        if extra in name_list:
            name_list.remove(extra)
    return name_list

#opens csv, actually calls processing functions and dedups inside
def extract_pis(csv_string):
    """Read a project CSV and collect the PI names found in it.

    The CSV dialect is sniffed from the first 1024 characters of the file.
    A row is used when it has more than 14 columns and its first cell is not
    the header text "Project Title". Each such row is handed to
    get_first_PI and get_other_PIs.

    csv_string: path of the CSV file to read.
    Returns: a tuple (first_pis, other_pis) of de-duplicated name lists.
    """
    full_first_pis = []
    full_other_pis = []
    # 'rU' is Python 2 universal-newline mode; the file is closed by the
    # with-block even if parsing fails.
    with open(csv_string, 'rU') as csvfile:
        dialect = csv.Sniffer().sniff(csvfile.read(1024))
        csvfile.seek(0)
        for row in csv.reader(csvfile, dialect):
            if len(row) <= 14 or row[0] == "Project Title":
                continue
            full_first_pis.append(get_first_PI(row))
            # Parse the other-PI column once and reuse the result.
            other_pis = get_other_PIs(row)
            # get_other_PIs returns the string "None" when there are none.
            if other_pis != "None":
                full_other_pis.extend(other_pis)
    # dedup based on last, first
    return dedup_PI_list(full_first_pis), dedup_PI_list(full_other_pis)

#dedups a list of [last, first] name pairs, keeping the first occurrence of each
def dedup_PI_list(first_last_list):
    """Return a new list holding each distinct entry once, in first-seen order.

    Entries are compared with ==, so unhashable items such as
    [last, first] lists are fine. The input list is not modified.
    """
    unique_entries = []
    for candidate in first_last_list:
        if candidate in unique_entries:
            continue
        unique_entries.append(candidate)
    return unique_entries

#gets primary PIs specified row and processes, returns one name -- should be last, first
def get_first_PI(row):
    """Parse the name in column 12 of a CSV row into a two-item list.

    The cell is split on spaces and cleaned with process_names. The first
    token ending in "," is taken as the last name and the token after it as
    the first name; both are run through str.capitalize(). When no token
    ends in a comma, the first two tokens are used.

    Returns: [last, first]. A lone 'Unavailable' becomes
    ['Unavailable,', 'Unavailable']; any other single token is returned
    with 'Unavailable' appended as the first name.
    Raises: ValueError when the tokens fit none of these shapes.
    """
    tokens = process_names(row[12].split(" "))
    # Position of the first comma-terminated token, or 0 when there is none.
    comma_at = next(
        (pos for pos, tok in enumerate(tokens) if tok[-1] == ","), 0)

    if comma_at + 2 <= len(tokens):
        last_name = tokens[comma_at].capitalize()
        first_name = tokens[comma_at + 1].capitalize()
        return [last_name, first_name]
    if tokens == ['Unavailable']:
        return ['Unavailable,', 'Unavailable']
    # people with only initials, no first name
    if len(tokens) == 1:
        tokens.append('Unavailable')
        return tokens
    raise ValueError('Something is wrong with name parsing!')

#gets other PIs in specified row and processes, returns list of names -- should be last, first
def get_other_PIs(row):
    """Parse the additional PIs in column 13 of a CSV row.

    The cell is split on ";" into one fragment per person; each fragment is
    split on spaces, cleaned with process_names and turned into a two-item
    [last, first] list the same way get_first_PI does.

    Returns: the string "None" (not the None object) when the cell reads
    "Not Applicable"; otherwise a list of [last, first] lists, one per
    person. Every person is appended to the result, so an 'Unavailable' or
    single-token entry no longer cuts the list short or changes its shape.
    Raises: ValueError when a fragment cannot be parsed.
    """
    if row[13] == "Not Applicable":
        # String sentinel that extract_pis compares against.
        return "None"

    list_of_names = []
    for entry in process_names(row[13].split(";")):
        if not entry.strip():
            # Whitespace-only fragment, e.g. left by a trailing "; ".
            continue
        name = process_names(entry.split(" "))
        # find the item that ends in ","
        index_of_comma = 0
        for position, token in enumerate(name):
            if token[-1] == ",":
                index_of_comma = position
                break
        if len(name) >= index_of_comma + 2:
            list_of_names.append([name[index_of_comma].capitalize(),
                                  name[index_of_comma + 1].capitalize()])
        elif name == ['Unavailable']:
            list_of_names.append(['Unavailable,', 'Unavailable'])
        # people with only initials, no first name
        elif len(name) == 1:
            name.append('Unavailable')
            list_of_names.append(name)
        else:
            raise ValueError('Something is wrong with name parsing!')
    return list_of_names

#API calling function, queries the API one name at a time, outputs a dictionary keyed by name
def get_genderization(list_of_names):
    """Look up each name with the Genderize client and collect the results.

    Each name is printed before it is looked up, which shows progress. When
    a lookup fails, a placeholder record with empty count, gender and
    probability is stored for that name, so one bad request does not abort
    the run.

    list_of_names: iterable of first names.
    Returns: dict mapping each name to its result record.
    """
    first_name_dicts = {}
    client = None
    for current_name in list_of_names:
        # You can comment this print out but it is helpful to see the
        # progress of the genderization
        print(current_name)
        try:
            if client is None:
                # Built lazily inside the try so a construction failure is
                # handled like any other failed lookup, then reused.
                client = Genderize()
            new_name = client.get([current_name])[0]
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and
            # SystemExit still stop the script.
            new_name = {u'count': "", u'gender': "", u'name': current_name, u'probability': ""}
        first_name_dicts[current_name] = new_name
    return first_name_dicts

#takes in a list of api output dictionaries, returns None, saves csv with specified output name
def export(genderized_list_of_dicts, output_name):
    """Write a list of genderized records to a CSV file.

    A header row is written first, then one row per record, with the
    columns count, gender, name, probability, full_name and num_pi.

    genderized_list_of_dicts: list of dicts keyed by those column names.
    output_name: path of the CSV file to create or overwrite.
    """
    columns = [u'count', u'gender', u'name', u'probability', u'full_name', u'num_pi']
    with open(output_name, 'w') as out_handle:
        dict_writer = csv.DictWriter(out_handle, fieldnames=columns)
        dict_writer.writeheader()
        for record in genderized_list_of_dicts:
            dict_writer.writerow(record)
#takes in a list of api output dictionaries, returns None, prints out preliminary count
def count_out_dict(name_dict):
    """Print probability-weighted female/male counts and the female ratio.

    A name with a 51% probability of being female counts as .51 female and
    .49 male. Entries whose gender is neither "female" nor "male" are
    skipped.

    name_dict: sequence of dicts with u'gender' and u'probability' keys
        (despite the parameter name, a list of records rather than a dict).
    Returns: None; the counts are printed.
    """
    sum_of_male_prob = 0
    sum_of_female_prob = 0
    for entry in name_dict:
        gender = entry[u'gender']
        if gender == "female":
            sum_of_male_prob += 1 - entry[u'probability']
            sum_of_female_prob += entry[u'probability']
        elif gender == "male":
            sum_of_male_prob += entry[u'probability']
            sum_of_female_prob += 1 - entry[u'probability']

    # Single-string prints give the same text on Python 2 and Python 3.
    print("Preliminary Count")
    print("Female %s" % sum_of_female_prob)
    print("Male %s" % sum_of_male_prob)
    total = sum_of_female_prob + sum_of_male_prob
    if total:
        print("Current ratio %s" % (float(sum_of_female_prob) / total))
    else:
        # No entry had an assigned gender, so the ratio is undefined;
        # report that instead of dividing by zero.
        print("Current ratio n/a (no names with an assigned gender)")

def runGenderizationScript(file, keyword, exportAll):
    """Run the whole pipeline for one input CSV.

    Steps: extract PI names, look up every distinct first name, attach the
    results to each PI, drop "other" PIs who also appear as a first PI,
    optionally export the combined list, and print preliminary counts.
    Console output is also appended to output/log_<keyword>.txt while the
    function runs.

    file: path of the input CSV, e.g. 'input/<filename>'.
    keyword: descriptive keyword used in the output file names.
    exportAll: when truthy, write output/names_<keyword>.csv.
    Returns: None.
    """
    log_file = "output/log_" + keyword + ".txt"
    export_file = "output/names_" + keyword + ".csv"

    previous_stdout = sys.stdout
    logger = Logger(log_file)
    sys.stdout = logger
    try:
        first_pis, other_pis = extract_pis(file)

        # Every PI entry is a [last, first] pair; index 1 is the first name.
        first_names = [pi[1] for pi in list(first_pis) + list(other_pis)]
        deduped_first_names = np.unique(first_names)
        print(len(deduped_first_names))

        gender_by_first_name = get_genderization(deduped_first_names)

        def annotate(pis, num_pi):
            # One record per PI: name fields, then the lookup result for the
            # first name, then the rank (1 = first PI, 2 = other PI).
            records = []
            for pi in pis:
                record = {u'full_name': " ".join(pi), u'name': pi[1]}
                record.update(gender_by_first_name[record[u'name']])
                record[u'num_pi'] = num_pi
                records.append(record)
            return records

        genderized_first_pis = annotate(first_pis, 1)
        genderized_other_pis = annotate(other_pis, 2)

        # check for first pi that is also other pi and remove
        first_pi_full_names = set(
            record[u'full_name'] for record in genderized_first_pis)
        cleaned_genderize_other_pis = [
            record for record in genderized_other_pis
            if record[u'full_name'] not in first_pi_full_names
        ]

        genderized_all_pis = deepcopy(genderized_first_pis)
        genderized_all_pis.extend(cleaned_genderize_other_pis)

        if exportAll:
            export(genderized_all_pis, export_file)

        print("===================================================================================")
        print("PRELIMINARY COUNTS")
        print("Please add in names with no assigned gender by hand and calculate this ratio again!")
        print("===================================================================================")
        count_out_dict(genderized_all_pis)
    finally:
        # Put stdout back and close the log so a later call does not write
        # through this run's logger, and the file handle is not leaked.
        sys.stdout = previous_stdout
        logger.log.close()

#TODO call the main function -- here is an example!
if __name__ == "__main__":
    # Example run. The guard keeps the pipeline (file reads, API calls,
    # stdout redirection) from starting when this file is imported.
    input_file = 'input/SR_22Nov2016_113407_13085397.csv'
    keyword = 'electrophysiology'

    runGenderizationScript(input_file, keyword, True)
#Share this: