#Tested with python 2.7
#Last modified 01/26/2018
#Dependency for this code: genderize (https://github.com/SteelPangolin/genderize)
#Dependency for genderize: requests v1 (installable via pip)
#Genderize installation via pip : pip install git+https://github.com/SteelPangolin/genderize.git
#imports
import csv, re, sys
from genderize import Genderize
from copy import deepcopy
import numpy as np
#logs console output, mostly for preliminary counts
#copied from http://stackoverflow.com/a/5916874
class Logger(object):
    """File-like object that echoes everything written to it into a log file.

    Meant to be assigned to ``sys.stdout``: each message is written to the
    stream that was ``sys.stdout`` when the Logger was created and is also
    appended to ``filename``.
    """

    def __init__(self, filename="Default.log"):
        # Remember the stream that is current right now so output still
        # reaches the console after sys.stdout is replaced by this object.
        self.terminal = sys.stdout
        # Append mode: repeated runs accumulate in the same log file.
        self.log = open(filename, "a")

    def write(self, message):
        """Write ``message`` to the terminal and append it to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams.

        Anything that stands in for sys.stdout is expected to provide
        flush(); without it, code that flushes stdout raises AttributeError.
        """
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Close the log file; the wrapped terminal stream is left open."""
        self.log.close()
#does some processing of name lists, getting rid of any extra white space and initials
def process_names(name_list):
    """Clean a list of name tokens in place and return the same list.

    Three clean-ups are applied, in this order:
      1. single-letter initials such as "j" or "j." are dropped;
      2. empty strings (left over from extra whitespace) are dropped;
      3. of the tokens ending in a comma, only the first is meant to stay --
         the later ones come from names formatted like "Smith, Ph.D, John".

    name_list -- list of strings; it is modified in place.
    Returns the same list object, after cleaning.
    """
    # One letter, optionally followed by a period. The class must be
    # [a-zA-Z]: "A-z" would also match the punctuation characters that sit
    # between "Z" and "a" in ASCII ([, \, ], ^, _ and `).
    initial_pattern = re.compile(r'^[a-zA-Z]\.?$')

    # Slice assignment keeps the caller's list object, as the in-place
    # remove() calls used to do.
    name_list[:] = [token for token in name_list
                    if token != '' and not initial_pattern.search(token)]

    #now we need to remove some extra items ending in commas that arise
    #because of names formatted as "Smith, Ph.D, John"
    items_ending_in_commas = [token for token in name_list
                              if token.endswith(',')]
    # remove() deletes the first matching value, so when the same comma token
    # appears twice it is the earlier copy that goes.
    for extra_item in items_ending_in_commas[1:]:
        if extra_item in name_list:
            name_list.remove(extra_item)
    return name_list
#opens csv, actually calls processing functions and dedups inside
def extract_pis(csv_string):
    """Read the CSV at path ``csv_string`` and collect its PI names.

    Every row with more than 14 columns whose first cell is not the header
    text "Project Title" is passed to get_first_PI() and get_other_PIs().

    Returns a tuple (first_pis, other_pis); both lists are de-duplicated
    with dedup_PI_list().
    """
    # Python 2 needs 'rU' for universal-newline reading. Python 3 does that
    # by default in text mode and rejects the 'U' flag from 3.11 onwards.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'

    full_first_pis = []
    full_other_pis = []
    with open(csv_string, read_mode) as csvfile:
        # Sniff the dialect from the first 1024 characters, then rewind so
        # the reader starts at the top of the file.
        dialect = csv.Sniffer().sniff(csvfile.read(1024))
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        #go through list of names and parse out first name
        for row in reader:
            if len(row) > 14 and row[0] != "Project Title":
                full_first_pis.append(get_first_PI(row))
                # Parse the other-PI column once and reuse the result.
                other_pis = get_other_PIs(row)
                if other_pis != "None":
                    full_other_pis.extend(other_pis)
    #dedup based on last, first
    return dedup_PI_list(full_first_pis), dedup_PI_list(full_other_pis)
#dedups a list of [last, first] names, keeping the first occurrence of each
def dedup_PI_list(first_last_list):
    """Return a new list holding each entry of ``first_last_list`` once.

    Entries are compared with ``==`` and the first occurrence of each is
    kept, so the original order is preserved.
    """
    unique_entries = []
    for entry in first_last_list:
        if entry in unique_entries:
            # already recorded earlier in the list
            continue
        unique_entries.append(entry)
    return unique_entries
#gets primary PIs specified row and processes, returns one name -- should be last, first
def get_first_PI(row):
    """Parse the primary PI in column 12 of ``row`` into a two-item list.

    The cell is split on spaces and cleaned with process_names(). The first
    token ending in a comma is taken as the last name and the token after it
    as the first name; if no token ends in a comma, the first two tokens are
    used.

    Returns [last, first], both capitalized, in the normal case;
    ['Unavailable,', 'Unavailable'] when the cell is just "Unavailable";
    [token, 'Unavailable'] when only one token survives cleaning.
    Raises ValueError when none of these shapes applies.
    """
    name = process_names(row[12].split(" "))
    # find the item that ends in ","
    index_of_comma = 0
    for position, token in enumerate(name):
        if token[-1] == ",":
            index_of_comma = position
            break
    if len(name) >= index_of_comma + 2:
        return [name[index_of_comma].capitalize(),
                name[index_of_comma + 1].capitalize()]
    if name == ['Unavailable']:
        return ['Unavailable,', 'Unavailable']
    #people with only initials, no first name
    if len(name) == 1:
        name.append('Unavailable')
        return name
    raise ValueError('Something is wrong with name parsing!')
#gets other PIs in specified row and processes, returns list of names -- should be last, first
def get_other_PIs(row):
    """Parse the additional PIs in column 13 of ``row``.

    The cell is split on ";" into individual names, and each name is parsed
    the same way get_first_PI() parses the primary PI.

    Returns the string "None" when the cell is "Not Applicable"; otherwise a
    list of two-item lists [last, first], one per name.
    Raises ValueError when a name cannot be parsed.
    """
    if row[13] == "Not Applicable":
        return "None"

    list_of_names = []
    for raw_name in process_names(row[13].split(";")):
        name = process_names(raw_name.split(" "))
        # find the item that ends in ","
        index_of_comma = 0
        for position, token in enumerate(name):
            if token[-1] == ",":
                index_of_comma = position
                break
        if len(name) >= index_of_comma + 2:
            list_of_names.append([name[index_of_comma].capitalize(),
                                  name[index_of_comma + 1].capitalize()])
        # The two fallback branches append rather than return. Returning here
        # would drop every PI parsed so far and hand the caller a flat list
        # of strings instead of a list of [last, first] pairs.
        elif name == ['Unavailable']:
            list_of_names.append(['Unavailable,', 'Unavailable'])
        # people with only initials, no first name
        elif len(name) == 1:
            list_of_names.append([name[0], 'Unavailable'])
        else:
            raise ValueError('Something is wrong with name parsing!')
    return list_of_names
#API calling function: queries Genderize once per name, outputs a dictionary keyed by name
def get_genderization(list_of_names):
    """Look up each name with Genderize and return the results by name.

    list_of_names -- iterable of first names; one API call is made per name.

    Returns a dict mapping each name to the first element of the Genderize
    response for it. If the lookup fails for any reason, the name maps to a
    placeholder dict whose count, gender and probability are empty strings,
    so the run can continue and the gap can be filled in by hand.
    """
    first_name_dicts = {}
    for current_name in list_of_names:
        #You can comment this print out but it is helpful to see the progress of the genderization
        print(current_name)
        try:
            new_name = (Genderize().get([current_name]))[0]
        # Best effort by design, but catch Exception rather than everything
        # so Ctrl-C and SystemExit still stop a long run.
        except Exception:
            new_name = {u'count': "", u'gender': "", u'name': current_name, u'probability': ""}
        first_name_dicts[current_name] = new_name
    return first_name_dicts
#takes in a list of genderized name dicts, returns None, saves csv with specified output name
def export(genderized_list_of_dicts, output_name):
    """Write genderized name records to the CSV file ``output_name``.

    genderized_list_of_dicts -- iterable of dicts whose keys come from the
        column list below; csv.DictWriter rejects unknown keys with a
        ValueError.
    output_name -- path of the CSV file to create (overwritten if present).

    Returns None.
    """
    fieldnames = [u'count', u'gender', u'name', u'probability', u'full_name', u'num_pi']
    with open(output_name, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Iterate directly instead of indexing, so any iterable works.
        for record in genderized_list_of_dicts:
            writer.writerow(record)
#takes in a list of genderized name dicts, returns None, prints out preliminary counts
def count_out_dict(name_dict):
    """Print probability-weighted female/male counts and their ratio.

    name_dict -- despite the name, a sequence of dicts, each with u'gender'
        and u'probability' keys. A name that is female with probability 0.51
        counts as 0.51 female and 0.49 male. Entries whose gender is neither
        "female" nor "male" are skipped.

    Returns None; the counts are printed.
    """
    sum_of_male_prob = 0
    sum_of_female_prob = 0
    for entry in name_dict:
        gender = entry[u'gender']
        if gender == "female":
            sum_of_male_prob += 1 - entry[u'probability']
            sum_of_female_prob += entry[u'probability']
        elif gender == "male":
            sum_of_male_prob += entry[u'probability']
            sum_of_female_prob += 1 - entry[u'probability']

    # Single-argument print calls with %-formatting give the same text on
    # Python 2 and Python 3.
    print("Preliminary Count")
    print("Female %s" % (sum_of_female_prob,))
    print("Male %s" % (sum_of_male_prob,))
    total = sum_of_female_prob + sum_of_male_prob
    if total:
        print("Current ratio %s" % (float(sum_of_female_prob) / total,))
    else:
        # No entry had an assigned gender, so there is no ratio to compute.
        print("Current ratio undefined (no names with an assigned gender)")
def runGenderizationScript(file, keyword, exportAll):
    """Run the whole pipeline: parse the CSV, genderize, export, print counts.

    file -- path of the input CSV, e.g. 'input/<filename>'.
    keyword -- descriptive keyword used to name the outputs:
        "output/log_<keyword>.txt" and "output/names_<keyword>.csv".
    exportAll -- when truthy, the genderized names are written to the CSV.

    While the function runs, console output is also appended to the log
    file. Returns None.
    """
    log_file = "output/log_" + keyword + ".txt"
    export_file = "output/names_" + keyword + ".csv"

    logger = Logger(log_file)
    previous_stdout = sys.stdout
    sys.stdout = logger
    try:
        first_pis, other_pis = extract_pis(file)

        # Plain list concatenation works when either list is empty, and the
        # first names stay ordinary strings for the API call and dict keys.
        all_pis = list(first_pis) + list(other_pis)
        deduped_first_names = sorted(set(pi[1] for pi in all_pis))
        print(len(deduped_first_names))
        gender_by_first_name = get_genderization(deduped_first_names)

        genderized_first_pis = [{u'full_name': " ".join(first_pi), u'name': first_pi[1]} for first_pi in first_pis]
        genderized_other_pis = [{u'full_name': " ".join(other_pi), u'name': other_pi[1]} for other_pi in other_pis]
        for author in genderized_first_pis:
            author.update(gender_by_first_name[author[u'name']])
            author[u'num_pi'] = 1
        for author in genderized_other_pis:
            author.update(gender_by_first_name[author[u'name']])
            author[u'num_pi'] = 2

        #check for first pi that is also other pi and remove
        first_pi_full_names = set(pi[u'full_name'] for pi in genderized_first_pis)
        cleaned_genderize_other_pis = [pi for pi in genderized_other_pis
                                       if pi[u'full_name'] not in first_pi_full_names]
        genderized_all_pis = genderized_first_pis + cleaned_genderize_other_pis

        if exportAll:
            export(genderized_all_pis, export_file)
        print("===================================================================================")
        print("PRELIMINARY COUNTS")
        print("Please add in names with no assigned gender by hand and calculate this ratio again!")
        print("===================================================================================")
        count_out_dict(genderized_all_pis)
    finally:
        # Put the real stdout back and close the log. Otherwise a second run
        # would wrap this Logger and write its output into this run's log too.
        sys.stdout = previous_stdout
        logger.log.close()
#TODO call the main function -- here is an example!
#Guarded so that importing this module for its functions does not start a
#genderization run (file parsing, API calls, output files).
if __name__ == "__main__":
    file = 'input/SR_22Nov2016_113407_13085397.csv'
    keyword = 'electrophysiology'
    runGenderizationScript(file, keyword, 1)
#(Stray web-page text "Like this:" / "Like Loading..." followed here; it is not part of the script.)