Plot of the history of deposits of x-ray and cryoEM structural data in the RCSB Protein Data Bank
Matlab code
% Program taking raw x-ray and cryo data files mined from the RCSB Protein
% Data Bank and splitting them into separate files, one per deposition year
% Latest version 2021.06.04
% It has become apparent that even well into a given year, the deposits in
% the PDB from the previous year may not be complete and this can continue
% to increase over several years. It is therefore recommended to do a full
% data mine using the below procedure, starting from the historically first
% deposit. Note also that in 2019, PDB had its first x-ray entry in 1972,
% though in 2021, this entry seems to have disappeared and the first entry
% is at 1976. It is therefore important to keep an eye on these (somewhat
% inexplicable) changes. My approach is simply to take the most recent
% data.
% This is the procedure I use to mine the data for this parsing program
% xrayCryoDataParse.m.
%
% • Go to https://www.rcsb.org/search
% • Select methods->experimental methods and enter either x-ray diffraction
% or electron microscopy
% • Add field deposition->deposit date and enter first and last dates
% (range (upper incl.))
% • Press return
% • Click on tabular report-> Create custom report
% • Select deposit date, structure molecular weight, resolution IN THAT
% ORDER so columns are date, MW, resn in the CSV file then click on
% • Click on download CSV file. If there are more entries than 2500 for the
% search you have done, they are downloadable as chunks of 2500 at a
% time. In this case, you need to concatenate them. Do this by:
% o Collect all the CSV files in a single folder
% o Open terminal in your mac
% o Change directory to this folder, e.g. cd Desktop/temp/
% o Type in: cat *.csv >yourFilename.csv
% • Open this CSV file in numbers
% • Go to the column header of the deposition date – this is Column B
% • Click on first date entry (it should be the oldest) then scroll to
% bottom row and click on the last resolution entry while holding down
% shift key – this should highlight all three columns. Press cmd-C (copy)
% • Open a new file in TexShop and press cmd-V (paste) – all the data
% should now be in the tex file.
% • Now you need to replace all dashes in the dates with tabs (e.g.
% 2020-05-05 changes to 2020 05 05, that is, with tabs in between. Spaces
% don’t work, as matlab then thinks it is a text element). To do this,
% simply press cmd-F and replace every dash with a tab. To get a tab into
% the replace field, open a blank tex file, enter a tab, then copy-paste
% it into the field
% • Save the file with the appropriate name
% • There might be some rows with multiple resolutions separated by a
% comma. I search for these with a comma and delete the second entry. Not
% very clean but hey-ho
% • The relevant columns are
% o 1: The year
% o 4: The molecular weight in kDa
% o 5: The resolution in Angstroms
clear all; close all;

% Load the two raw tables prepared by the procedure described above.
% Layout used below: column 1 = year; column 4 = MW [kDa];
% column 5 = resolution [Angstrom]. Each table is expected to be
% (number of data points) x 5 columns.
inDataX = importdata('xrayyears/allXrayData.dat');
inDataC = importdata('cryoyears/allCryoData.dat');

% --- x-ray table -------------------------------------------------------
dataSizeX = size(inDataX);   % [rows, columns] of the x-ray table
NdataX    = dataSizeX(1);    % Number of x-ray data points
yrX       = inDataX(:,1);    % Year each x-ray structure was deposited
MWX       = inDataX(:,4);    % Molecular weight of each x-ray structure
resnX     = inDataX(:,5);    % Resolution in Angstroms of each x-ray structure

% --- cryo table --------------------------------------------------------
dataSizeC = size(inDataC);   % [rows, columns] of the cryo table
NdataC    = dataSizeC(1);    % Number of cryo data points
yrC       = inDataC(:,1);    % Year each cryo structure was deposited
MWC       = inDataC(:,4);    % Molecular weight of each cryo structure
resnC     = inDataC(:,5);    % Resolution in Angstroms of each cryo structure

% Per-year output file names are built as <prefix><year><extension>
str1X = 'xrayyears/year';    % Created files go in directory 'xrayyears'
str1C = 'cryoyears/year';    % Created files go in directory 'cryoyears'
str3  = '.dat';

% Year found in the first row of each table, and its text form
yearX = inDataX(1,1);        % First year in x-ray data file
yearC = inDataC(1,1);        % First year in cryo data file
str2X = num2str(yearX);
str2C = num2str(yearC);
% Now split x-ray data into files according to the deposition year.
%
% Rows are grouped by their year value and each per-year file is opened
% exactly once, written, and closed. This means:
%   - rows that are out of chronological order (e.g. CSV chunks that were
%     concatenated in the wrong order) still end up in the file for their
%     own year instead of being silently dropped;
%   - no file handle is ever left open;
%   - the file is not re-opened for every single data point.
% Rows whose year is NaN cannot be assigned to a file and are skipped.
% Within each file the rows keep the order they had in the input table.
%
% NOTE: files are opened in append mode ('a+'), so running this script
% twice without first deleting the old year*.dat files duplicates every
% entry. Delete the old per-year files before re-running.
yearsPresentX = unique(yrX(~isnan(yrX)));   % Sorted list of distinct years
for i = 1:numel(yearsPresentX)
    yearX = yearsPresentX(i);
    rowsThisYear = (yrX == yearX);          % Logical mask of this year's rows
    str2 = num2str(yearX);
    yearFileName = strcat(str1X,str2,str3);
    [fileID,openMsg] = fopen(yearFileName,'a+');
    if fileID == -1
        % Typically the output directory does not exist or is read-only
        error('xrayCryoDataParse:fopenFailed', ...
              'Could not open %s for writing: %s',yearFileName,openMsg);
    end
    % fprintf consumes the matrix column by column, so transpose to emit
    % one "MW resolution" pair per output line
    fprintf(fileID,'%10.2f %10.3f\n',[MWX(rowsThisYear) resnX(rowsThisYear)].');
    fclose(fileID);
end
% Now split cryo data into files according to the deposition year.
%
% Rows are grouped by their year value and each per-year file is opened
% exactly once, written, and closed. This means:
%   - rows that are out of chronological order (e.g. CSV chunks that were
%     concatenated in the wrong order) still end up in the file for their
%     own year instead of being silently dropped;
%   - no file handle is ever left open;
%   - the file is not re-opened for every single data point.
% Rows whose year is NaN cannot be assigned to a file and are skipped.
% Within each file the rows keep the order they had in the input table.
%
% NOTE: files are opened in append mode ('a+'), so running this script
% twice without first deleting the old year*.dat files duplicates every
% entry. Delete the old per-year files before re-running.
yearsPresentC = unique(yrC(~isnan(yrC)));   % Sorted list of distinct years
for i = 1:numel(yearsPresentC)
    yearC = yearsPresentC(i);
    rowsThisYear = (yrC == yearC);          % Logical mask of this year's rows
    str2 = num2str(yearC);
    yearFileName = strcat(str1C,str2,str3);
    [fileID,openMsg] = fopen(yearFileName,'a+');
    if fileID == -1
        % Typically the output directory does not exist or is read-only
        error('xrayCryoDataParse:fopenFailed', ...
              'Could not open %s for writing: %s',yearFileName,openMsg);
    end
    % fprintf consumes the matrix column by column, so transpose to emit
    % one "MW resolution" pair per output line
    fprintf(fileID,'%10.2f %10.3f\n',[MWC(rowsThisYear) resnC(rowsThisYear)].');
    fclose(fileID);
end
% Program to plot out historical deposits of x-ray and cryoEM structures in
% the RCSB Protein Data Bank. It uses data prepared by the program
% xrayCryoDataParse.m: per-year files xrayyears/year<YYYY>.dat and
% cryoyears/year<YYYY>.dat, read here as column 1 = molecular weight and
% column 2 = resolution.
%
% For every year that has at least one data file, the new points are added
% to a cumulative log-log scatter plot (x-ray in gold at the bottom of the
% stack, cryoEM in dark blue on top), the year and running totals are
% written on the plot, and one frame is appended to xraysCryoHistory.mp4.
clear; close all;

c1 = [0.85,0.75,0];          % gold for x-ray data
c2 = [0.215,0.267,0.578];    % dark blue for cryoEM data
str1X = 'xrayyears/year';    % Per-year x-ray files live in 'xrayyears'
str1C = 'cryoyears/year';    % Per-year cryo files live in 'cryoyears'
str3 = '.dat';
str4 = 'Resolution [';
str5 = char(197);            % Latin-1/Unicode code point of the Angstrom symbol
str6 = ']';
strTot = [str4,str5,str6];   % y-axis label
str9 = 'x-ray deposits = ';
str10 = 'cryo deposits = ';
sumCryo = 0;                 % Running total of cryo entries plotted so far
sumXrays = 0;                % Running total of x-ray entries plotted so far

% Make background colour white. Root defaults only apply to objects created
% AFTER they are set, so this must come before the figure is created.
set(0,'defaultfigurecolor',[1 1 1]);
fig = figure('units','pixels','position',[0 0 1600 900],'ToolBar','none');

% Create and format the axes once. With hold on, subsequent scatter calls
% add to the axes without resetting these properties.
ax = axes('Parent',fig);
hold(ax,'on');
set(ax,'FontName','Helvetica','fontsize',25);
set(ax,'TickLength',[0.016, 2]);
set(ax,'xscale','log');
set(ax,'yscale','log');
set(ax,'linewidth',2);
xlim(ax,[0.1 1e5]);
ylim(ax,[0.4 1e2]);
xlabel(ax,'Molecular weight [kDa]');
ylabel(ax,strTot);
box(ax,'on');

vid = VideoWriter('xraysCryoHistory.mp4','MPEG-4');
vid.FrameRate = 5;           % Default 30
vid.Quality = 100;           % Default 75
open(vid);

% Everything between open(vid) and close(vid) is guarded so the video file
% is closed even if reading a data file or grabbing a frame fails.
try
    for i = 1960:2050 % From a date well before any surprise additions from PDB
                      % to a date well into the future...
        str2 = num2str(i);   % Year label
        xyearFileName = strcat(str1X,str2,str3);
        cyearFileName = strcat(str1C,str2,str3);
        haveXray = isfile(xyearFileName);
        haveCryo = isfile(cyearFileName);

        if haveXray
            xyearData = importdata(xyearFileName);
            xX = xyearData(:,1); % MW
            yX = xyearData(:,2); % Resolution
            % Keep x-ray points underneath everything already drawn
            uistack(scatter(ax,xX,yX,10,c1,'filled','LineWidth',1,...
                'MarkerFaceAlpha', 0.8),'bottom');
            sumXrays = sumXrays + size(xX,1);
        end

        if haveCryo
            cyearData = importdata(cyearFileName);
            xC = cyearData(:,1); % MW
            yC = cyearData(:,2); % Resolution
            % Keep cryo points on top of everything already drawn
            uistack(scatter(ax,xC,yC,10,c2,'filled','LineWidth',1),'top');
            sumCryo = sumCryo + size(xC,1);
        end

        % Only years with data produce a frame
        if haveXray || haveCryo
            str7 = num2str(sumXrays);
            str8 = num2str(sumCryo);
            hText1 = text(ax,1e3, 1, str2, 'FontSize',25);
            hText2 = text(ax,0.3, 40, [str9,str7], 'FontSize',25);
            hText3 = text(ax,0.3, 25, [str10,str8], 'FontSize',25);
            frame = getframe(fig);
            writeVideo(vid,frame);
            % Remove this year's labels so they do not pile up in later frames
            delete(hText1);
            delete(hText2);
            delete(hText3);
        end
    end
catch plotErr
    close(vid);              % Do not leave the video file open on failure
    rethrow(plotErr);
end

% Finalise the movie file (MPEG-4, xraysCryoHistory.mp4)
close(vid);