Note: As of 2021, Journal Watch has been revamped to include cumulative data. See previous Journal Watch reports here:
Methodology
We have separated authors into three types: first authors, last (or sole) authors, and middle authors.
We retrieved names from journal issues using Python and Beautiful Soup while excluding the following sections whenever they appeared:
- For Journal of Neuroscience: “This Week in The Journal”, “Erratum”, “Editorial”, “Acknowledgment of Reviewers”, “Collaborating Reviewers”, “Commentary”
- For Neuron: “Editorial”, “Corrections”, “Obituary”, “Q&A” & “Meeting Report”
- For Nature: “Amendments & Corrections”, “Obituary”
- For ENeuro: “Editorial”, “Erratum”, “Erratum/Corrigendum”, “Acknowledgment of Reviewers”, “Collaborating Reviewers”
We classified retrieved names using Python, the genderize.io API, and an internal database of full names checked manually in the past. When the inferred gender from the API was either below 60% probability or had less than 5 examples, we searched for the author’s gender manually and noted them. When we were not able to easily find a gender, we excluded the author, and noted the number of exclusions and the type of author.
The script also produces summarizing stats for the year in a .csv file. The retrieved data is then put into Tableau to calculate the proportions and produce the interactive visualization.
The full python script is at the bottom of this post, the internal database of manually checked names is here as a .csv file, and the Tableau workbooks and associated datasets can be downloaded in the corner of the visualization.
Suggestions and other comments are welcome.
Base rates
First authors are more likely to be graduate students or postdoctoral fellows. Using our calculations based on registration for the 2018 Annual Meeting of the Society for Neuroscience, 50% of graduate students and postdocs in neuroscience are women.
For last or sole authors we take the base rate to be 31%, the percentage of women faculty at the same meeting. We take the base rate for all authors to be the overall percentage of 43%.
Python script
#To run this script, put in details required at the # Enter details # section
#Tested with python 3.7.3
#Updated Neuron Parsing 12/14/2018
#Dependencies: see imports (genderize, requests, BeautifulSoup4, numpy, pandas, lxml)
#Updated 7/30/20 as JournalWatch.py
#imports
import csv
from time import sleep
from genderize import Genderize
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import copy
import os
########################
# Enter details
########################
# Set to "True" to only update summaries after manual name-checking
summarize_only = False
# TODO Replace current month & year in lower case
year = "2020"
month = "july"
# TODO Replace links to JNeuro issues (table of contents page), from https://www.jneurosci.org/content/by/year
# Bug fix: the first URL was missing its trailing comma, so Python's implicit
# string concatenation silently merged the first two issue links into a single
# invalid URL (the list had 3 elements instead of 4).
jneuro_issues = ['https://www.jneurosci.org/content/40/27',
                 'https://www.jneurosci.org/content/40/28',
                 'https://www.jneurosci.org/content/40/29',
                 'https://www.jneurosci.org/content/40/30']
# JNeuro table-of-contents sections to skip entirely.
excluded_list = ["This Week in The Journal",
                 "Erratum",
                 "Editorial",
                 "Acknowledgment of Reviewers",
                 "Collaborating Reviewers"]
# TODO Replace links to Neuron issues, from https://www.cell.com/neuron/archive
# Exclude articles in section "Editorial", "Correction", "Obituary", "Q&A" & "Meeting Report" in Neuron - paste article name
neuron_issues = ["https://www.cell.com/neuron/issue?pii=S0896-6273(19)X0014-8",
                 "https://www.cell.com/neuron/issue?pii=S0896-6273(19)X0015-X"]
# Neuron has no machine-readable section tags, so excluded pieces are listed by title.
excluded_articles = ["Golden Anniversary of the Nicotinic Receptor",
                     "An Ultra-Sensitive Step-Function Opsin for Minimally Invasive Optogenetic Stimulation in Mice and Macaques",
                     "Retraction Notice to: A Societal Sleep Prescription"]
# TODO Replace link to Nature issues, from https://www.nature.com/neuro/volumes
nature_issues = ['https://www.nature.com/neuro/volumes/23/issues/6']
exclude_sections = ["Amendments & Corrections", "Obituary"]  # check that this doesn't have to be author and editor corrections
# TODO Replace link to ENeuro issues every 2 months (table of contents page), from https://www.eneuro.org/content/by/year
# Ignore on months without publication
eneuro_issues = ["https://www.eneuro.org/content/7/4"]
# ENeuro table-of-contents sections to skip entirely.
excluded_sections = ["Erratum/Corrigendum",
                     "Editorial",
                     "Erratum",
                     "Acknowledgment of Reviewers",
                     "Collaborating Reviewers"]
##########################
# Genderization Functions
##########################
def get_genderization(list_of_names, verbose=True):
    '''
    Query the genderize.io API for each first name.

    Input: list of first names
    Output: dictionary with first name as key and API output (as dictionary) as value

    Names the API cannot resolve (network error, rate limit, unknown name)
    get a placeholder record with blank count/gender/probability so that
    downstream code can flag them for manual checking.
    '''
    first_name_dicts = {}
    for current_name in list_of_names:
        if verbose:
            print(current_name)
        # get genderization; fall back to a blank record on any API failure.
        # Bug fix: was a bare "except:", which also swallowed
        # KeyboardInterrupt/SystemExit — narrowed to Exception.
        try:
            new_name = Genderize().get([current_name])[0]
        except Exception:
            new_name = {u'count': "", u'gender': "", u'name': current_name, u'probability': ""}
        # add name to outputted dictionary
        first_name_dicts[current_name] = new_name
    return first_name_dicts
def export(genderized_list_of_dicts, output_name):
    '''
    Write genderization records to a csv file named output_name.

    Inputs:1) final list of dictionaries for each person
           2) name for outputted file
    Output: None, saves csv file with outputted genderization
    '''
    fieldnames = [u'count', u'gender', u'name', u'probability', u'full_name', u'number']
    # Bug fix: the csv module requires newline='' on the file handle;
    # without it every row is followed by a blank line on Windows.
    with open(output_name, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Iterate the records directly instead of indexing via range(len(...)).
        for record in genderized_list_of_dicts:
            writer.writerow(record)
def paperGenderization(full_names, num_auth_indicator, journal, month, verbose=True):
    '''
    Inputs:1) list with full names from journal script
           2) list of corresponding number indicating author positions (from journal script)
              * Currently: 1 for first author, -1 for sole or last author, 0 otherwise
           3) name of journal
           4) month of journal
    Output: None, saves csv file with outputted genderization
    '''
    # Drop blank entries, then trim stray quote/comma characters from each name.
    cleaned_names = [n for n in full_names if n != '']
    cleaned_names = [n.strip('",') for n in cleaned_names]
    # Output file name; also recorded in the module-level `files` list so the
    # later cross-check pass knows which csv files were produced.
    export_file = "names_" + journal + "_" + month + ".csv"
    files.append(export_file)
    # Genderize each distinct first name exactly once.
    unique_first_names = np.unique([n.split()[0] for n in cleaned_names])
    if verbose:
        print(len(unique_first_names))
    genderized_first_names = get_genderization(unique_first_names)
    # Build one record per author: full name, first name, position indicator.
    records = []
    for idx, full_name in enumerate(cleaned_names):
        records.append({u'full_name': full_name,
                        u'name': full_name.split()[0],
                        u'number': num_auth_indicator[idx]})
    # Merge the deduped first-name genderization results into each record.
    for record in records:
        record.update(genderized_first_names[record[u'name']])
    # Save everything as a csv file.
    export(records, export_file)
##############################
# Journal Scraping Functions
##############################
def getJNeuroAuthors(issues, excluded_list):
    '''
    Scrape author names from Journal of Neuroscience tables of contents.

    Inputs:1) List of links to relevant issues
           2) List of excluded journal sections
    Output:1) Names of authors
           2) Numeric indicator for author positions (1 for first author, -1 for sole or last author, 0 otherwise)
    '''
    all_authors = []
    all_nums = []
    for idx_issue, issue in enumerate(issues):
        if idx_issue > 0:
            sleep(2.0)  # cautious delay between requests
        r = requests.get(issue)
        data_soup = bs(r.text, "lxml")
        # (removed an unused local dict that was never written to or read)
        sections = data_soup.find_all("div", attrs={"class": "issue-toc-section"})
        for section in sections:
            if section.h2.get_text() not in excluded_list:
                papers = section.find_all(attrs={"class": "highwire-cite-authors"})
                for paper in papers:
                    # Authors carry sequential "data-delta" indices; walk until
                    # an index is missing instead of tracking a flag variable.
                    auth_list = []
                    curr_auth = 0
                    while True:
                        auth_el = paper.find(attrs={"data-delta": str(curr_auth)})
                        if auth_el is None:
                            break
                        auth_list.append(auth_el.get_text())
                        curr_auth += 1
                    all_authors.extend(auth_list)
                    # Position indicators: 1 first, -1 last/sole, 0 middle.
                    num_authors = np.zeros(len(auth_list))
                    if len(auth_list) > 1:
                        num_authors[0] = 1
                        num_authors[-1] = -1
                    elif len(auth_list) == 1:
                        num_authors[0] = -1
                    all_nums.extend(num_authors)
    return all_authors, all_nums
def getNeuronAuthors(issues, excluded_articles):
    '''
    Scrape author names from Neuron (Cell Press) issue pages.

    Inputs:1) List of links to relevant issues
           2) List of excluded article names
    Output:1) Names of authors
           2) Numeric indicator for author positions (1 for first author, -1 for sole or last author, 0 otherwise)
    '''
    all_authors = []
    all_nums = []
    for issue_no, issue_url in enumerate(issues):
        if issue_no > 0:
            sleep(2.0)  # cautious delay between requests
        page = requests.get(issue_url)
        soup = bs(page.text, "lxml")
        for entry in soup.find_all(attrs={"class": "toc__item"}):
            # Excluded pieces are matched by their article title.
            if entry.a.get_text() in excluded_articles:
                continue
            author_names = [tag.get_text() for tag in entry.find_all(attrs={"class": "loa__item"})]
            all_authors.extend(author_names)
            # Position indicators: 1 first, -1 last/sole, 0 middle.
            position_flags = np.zeros(len(author_names))
            if len(author_names) > 1:
                position_flags[0] = 1
                position_flags[-1] = -1
            elif len(author_names) == 1:
                position_flags[0] = -1
            all_nums.extend(position_flags)
    return all_authors, all_nums
def getNatureNeuroAuthors(issues, excluded_sections):
    '''
    Scrape author names from Nature Neuroscience issue pages.

    Inputs:1) List of links to relevant issues
           2) List of excluded journal sections
    Output:1) Names of authors
           2) Numeric indicator for author positions (1 for first author, -1 for sole or last author, 0 otherwise)
    '''
    all_authors = []
    all_nums = []
    for idx_issue, issue in enumerate(issues):
        if idx_issue > 0:
            sleep(2.0)  # cautious delay between requests
        r = requests.get(issue)
        data_soup = bs(r.text, "lxml")
        articles = data_soup.find_all("article")
        for article in articles:
            # Bug fix: article.find() returns None when an <article> node has no
            # type tag; the original called .get_text() on it unconditionally,
            # raising AttributeError. Skip such nodes instead of crashing.
            type_el = article.find(attrs={"data-test": "article.type"})
            if type_el is None or type_el.get_text() in excluded_sections:
                continue
            names = article.find_all(attrs={"itemprop": "name"})
            auth_list = [name.get_text() for name in names]
            all_authors.extend(auth_list)
            # Position indicators: 1 first, -1 last/sole, 0 middle.
            num_authors = np.zeros(len(auth_list))
            if len(auth_list) > 1:
                num_authors[0] = 1
                num_authors[-1] = -1
            elif len(auth_list) == 1:
                num_authors[0] = -1
            all_nums.extend(num_authors)
    return all_authors, all_nums
##############################
# ENeuro Scraping Function
##############################
def getENeuroAuthors(issues, excluded_list):
    '''
    Scrape author names from ENeuro tables of contents (Highwire platform).

    Inputs:1) List of links to relevant issues
           2) List of excluded journal sections
    Output:1) Names of authors
           2) Numeric indicator for author positions (1 for first author, -1 for sole or last author, 0 otherwise)
    '''
    all_authors = []
    all_nums = []
    for idx_issue, issue in enumerate(issues):
        if idx_issue > 0:
            sleep(2.0)  # cautious delay between requests
        r = requests.get(issue)
        data_soup = bs(r.text, "lxml")
        # (removed an unused local dict that was never written to or read)
        sections = data_soup.find_all("div", attrs={"class": "issue-toc-section"})
        for section in sections:
            if section.h2.get_text() not in excluded_list:
                # Note: ENeuro tags authors with "highwire-citation-authors",
                # unlike JNeuro's "highwire-cite-authors".
                papers = section.find_all(attrs={"class": "highwire-citation-authors"})
                for paper in papers:
                    # Authors carry sequential "data-delta" indices; walk until
                    # an index is missing instead of tracking a flag variable.
                    auth_list = []
                    curr_auth = 0
                    while True:
                        auth_el = paper.find(attrs={"data-delta": str(curr_auth)})
                        if auth_el is None:
                            break
                        auth_list.append(auth_el.get_text())
                        curr_auth += 1
                    all_authors.extend(auth_list)
                    # Position indicators: 1 first, -1 last/sole, 0 middle.
                    num_authors = np.zeros(len(auth_list))
                    if len(auth_list) > 1:
                        num_authors[0] = 1
                        num_authors[-1] = -1
                    elif len(auth_list) == 1:
                        num_authors[0] = -1
                    all_nums.extend(num_authors)
    return all_authors, all_nums
##############################
# Check names against internal database for manually identified names
##############################
# File names for cross-checking & removing duplicates
# `files` is appended to by paperGenderization() and consumed by crosscheck();
# `csv_names` is filled in the __main__ summary pass with every names_*.csv found.
files = []
csv_names = []
def crosscheck(file_names):
    '''
    Fill in manually-identified genders from the internal database.

    Inputs: list of names_*.csv file paths produced by paperGenderization.
    Output: None; each file is rewritten in place with gender/probability
            filled in for previously blank entries and exact full-name
            duplicates dropped. Prints per-file and total update counts.
    '''
    # Build a full_name -> (gender, probability) lookup from the database.
    # Bug fix: files are now opened via "with" so handles are closed (the
    # original leaked every open()); the O(rows * database) nested scan is
    # replaced by a dict lookup. As in the original, a duplicated database
    # name resolves to its last occurrence.
    with open('gendered_names.csv', encoding='iso-8859-1') as db_file:
        lookup = {}
        for db_row in csv.DictReader(db_file):
            lookup[db_row['full_name']] = (db_row['gender'], db_row['probability'])
    total_count = 0
    # Check each file
    for name in file_names:
        print(name)
        count = 0
        with open(name, encoding='iso-8859-1') as journal_file:
            rows = list(csv.DictReader(journal_file))
        # Normalize names the same way paperGenderization does.
        for row in rows:
            row['full_name'] = row['full_name'].strip(',"')
        # Fill blank genders from the database.
        for row in rows:
            if row['gender'] == '' and row['full_name'] in lookup:
                gender, probability = lookup[row['full_name']]
                row['gender'] = gender
                row['probability'] = probability
                print("%s is %s" % (row['full_name'], gender))
                count += 1
        # Drop duplicate authors (by full name) and rewrite the same csv file.
        journal_df = pd.DataFrame(data=rows)
        journal_df = journal_df.drop_duplicates(subset=['full_name'])
        journal_df.to_csv(name, index=False)
        print("Updated %d values." % count)
        total_count += count
    print("Updated a total of %d values" % total_count)
# Convert an English month name to its 1-based calendar index (for sorting)
def month_idx(month):
    '''
    Return 1 for january ... 12 for december, ignoring case and surrounding
    whitespace. Raises ValueError for unrecognized names.
    '''
    month_names = ('january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october', 'november', 'december')
    return 1 + month_names.index(month.strip().lower())
if __name__ == "__main__":
    ##########
    # Calls
    ##########
    # Retrieving names and initial genderizing
    if not summarize_only:
        # JNeuro
        authors_jneuro, nums_jneuro = getJNeuroAuthors(jneuro_issues,excluded_list)
        paperGenderization(authors_jneuro, nums_jneuro, "jneuro", month)
        # Neuron
        authors_neuron, nums_neuron = getNeuronAuthors(neuron_issues,excluded_articles)
        paperGenderization(authors_neuron, nums_neuron, "neuron", month)
        # Nature
        authors_natneuro, nums_natneuro = getNatureNeuroAuthors(nature_issues,exclude_sections)
        paperGenderization(authors_natneuro, nums_natneuro, "natneuro", month)
        # ENeuro
        # ENeuro issues are scraped every other month; skip on the off months.
        if month.lower() in ['january', 'march', 'may', 'july', 'september', 'november']:
            authors_eneuro, nums_eneuro = getENeuroAuthors(eneuro_issues, excluded_sections)
            paperGenderization(authors_eneuro, nums_eneuro, "eneuro", month)
        # Check against internal database
        # (`files` was populated by the paperGenderization calls above)
        crosscheck(files)
    ###################
    # Calculate summary statistics about each month in the year
    ###################
    # Retrieve file names in the directory
    current_dir = os.getcwd()
    # NOTE(review): this os.walk rebinds the module-level `files` list, and it
    # also collects names_*.csv files from subdirectories even though
    # pd.read_csv below opens them by bare filename — confirm all such files
    # live in the top-level directory.
    for root, dirs, files in os.walk(current_dir):
        for file in files:
            if file.startswith('names_'):
                csv_names.append(file)
    # Per-journal accumulators: one column list per summary statistic.
    eneuro = {'month_index': [], 'month': [], 'male_first': [], 'male_last': [], 'male_all': [], 'female_first': [],
              'female_last': [], 'female_all': [], 'p_first': [], 'p_last': [], 'p_all': [], 'excluded_first': [],
              'excluded_last': [], 'excluded_all': []}
    jneuro = copy.deepcopy(eneuro)
    natneuro = copy.deepcopy(eneuro)
    neuron = copy.deepcopy(eneuro)
    # check each file
    for name in csv_names:
        print(name)
        # read & clean dataframe
        df = pd.read_csv(name, encoding='iso-8859-1')
        df.fillna("", inplace=True)
        df['probability'] = pd.to_numeric(df['probability'])
        df['count'] = pd.to_numeric(df['count'])
        df['number'] = pd.to_numeric(df['number'])
        # NOTE(review): the next three fillna calls are not in-place and their
        # return values are discarded, so they have no effect as written —
        # confirm whether inplace=True (or reassignment) was intended.
        df['count'].fillna(0)
        df['probability'].fillna(0.0)
        df['gender'].fillna('')
        df = df.drop_duplicates()
        df['index_col'] = df.index # to keep duplicates
        # remove names that doesn't meet criteria for reliability
        # probability <= 0.6, or probability > 0.6 but count < 5
        excluded = df[(df['probability'] <= 0.6) |
                      ((df['count'] < 5) & (df['count'] > 0) & (df['probability'] > 0.6) & (df['probability'] < 1.0)) |
                      (df['gender'] == "")]
        # Anti-join: keep only rows not present in `excluded` (concatenating and
        # dropping every duplicated row removes the excluded rows from df).
        df = pd.concat([df, excluded]).drop_duplicates(keep=False)
        # calculate statistics
        title = name[5:-4] # temp var for name: "<journal>_<month>" from "names_<journal>_<month>.csv"
        journal = globals()[title.strip('_').split('_')[0]] # name of journal -> its accumulator dict above
        month = title.strip('_').split('_')[1]
        month_index = month_idx(month)
        male_first = len(df[(df['gender'] == "male") & (df['number'] == 1.0)])
        male_last = len(df[(df['gender'] == "male") & (df['number'] == -1.0)])
        # NOTE(review): value_counts()[...] raises KeyError if a file has no
        # male (or, below, no female) authors — confirm this cannot occur.
        male_all = df['gender'].value_counts()['male']
        female_first = len(df[(df['gender'] == "female") & (df['number'] == 1.0)])
        female_last = len(df[(df['gender'] == "female") & (df['number'] == -1.0)])
        female_all = df['gender'].value_counts()['female']
        # NOTE(review): these proportions divide by zero when a category is empty.
        p_first = female_first / (female_first + male_first)
        p_last = female_last / (female_last + male_last)
        p_all = female_all / (female_all + male_all)
        excluded_first = len(excluded[excluded['number'] == 1.0].index)
        excluded_last = len(excluded[excluded['number'] == -1.0].index)
        excluded_all = len(excluded)
        # insert data into log
        # (each statistic computed above is looked up by name via globals() and
        # appended to the matching column of this journal's accumulator)
        for col in ['month_index', 'month', 'male_first', 'male_last', 'male_all', 'female_first', 'female_last',
                    'female_all', 'p_first', 'p_last', 'p_all', 'excluded_first', 'excluded_last', 'excluded_all']:
            journal[col].append(globals()[col])
        print("Done")
    combined = pd.DataFrame()
    # Export to individual csv
    for neuro in ['eneuro', 'jneuro', 'natneuro', 'neuron']:
        df_neuro = pd.DataFrame(data=globals()[neuro])
        df_neuro.sort_values(by=['month_index'], inplace=True)
        df_neuro.to_csv(neuro + '_' + year + '.csv', float_format='%.2f', index=False)
        df_neuro['journal'] = neuro
        combined = pd.concat([combined, df_neuro])
    # Export to main csv
    combined.to_csv('combined_' + year + '.csv', float_format='%.2f', index=False)