% Plot of the history of deposits of x-ray and cryoEM structural data in the RCSB Protein Data Bank
% Matlab code 

% Program taking raw x-ray and cryo data files mined from the RCSB Protein 

% Data Bank and splitting them into separate files, one per deposition year 

% Latest version 2021.06.04

 

% It has become apparent that even well into a given year, the deposits in 

% the PDB from the previous year may not be complete and this can continue 

% to increase over several years. It is therefore recommended to do a full 

% data mine using the below procedure, starting from the historically first 

% deposit. Note also that in 2019, PDB had its first x-ray entry in 1972, 

% though in 2021, this entry seems to have disappeared and the first entry 

% is at 1976. It is therefore important to keep an eye on these (somewhat 

% inexplicable) changes. My approach is simply to take the most recent 

% data. 

% This is the procedure I use to mine the data for this parsing program 

% xrayCryoDataParse.m. 

% • Go to https://www.rcsb.org/search 

% • Select methods->experimental methods and enter either x-ray diffraction 

%   or electron microscopy 

% • Add field deposition->deposit date and enter first and last dates 

%   (range (upper incl.))

% • Press return 

% • Click on tabular report-> Create custom report 

% • Select deposit date, structure molecular weight, resolution IN THAT 

%   ORDER so columns are date, MW, resn in the CSV file, then run the 

%   report 


% • Click on download CSV file. If there are more entries than 2500 for the 

%   search you have done, they are downloadable as chunks of 2500 at a 

%   time. In this case, you need to concatenate them. Do this by: 

%   o   Collect all the CSV files in a single folder 

%   o   Open terminal in your mac 

%   o   Change directory to this folder, e.g. cd Desktop/temp/

%   o   Type in: cat *.csv >yourFilename.csv 

% • Open this CSV file in numbers 

% • Go to the column header of the deposition date – this is Column B – 

%   and sort the table on it in ascending order (oldest deposit first) 


% • Click on first date entry (it should be the oldest) then scroll to 

%   bottom row and click on the last resolution entry while holding down 

%   shift key – this should highlight all three columns. Press cmd-C (copy)

% • Open a new file in TexShop and press cmd-V (paste) – all the data 

%   should now be in the tex file. 

% • Now you need to remove all dashes in the dates (e.g. 2020-05-05 changes 

%   to 2020  05  05), that is, with tabs in between. Spaces don’t work as 

%   matlab then thinks it is a text element. To do this simply press cmd-F 

%   and replace every dash with a tab. To get a tab into the replace 

%   field, open a blank tex file, enter a tab, then copy-paste it into the field 

% • Save the file with the appropriate name 

% • There might be some rows with multiple resolutions separated by a 

%   comma. I search for these with a comma and delete the second entry. Not 

%   very clean but hey-ho 

% • The relevant columns are 

%   o   1: The year 

%   o   4: The molecular weight in kDa 

%   o   5: The resolution in Angstroms 

 

% Start from a clean workspace. Plain 'clear' is used instead of 'clear all'
% so that breakpoints and cached functions are left untouched.
clear; close all

% Load the complete x-ray and cryoEM tables prepared with the procedure
% described in the header comment.
inDataX = importdata('xrayyears/allXrayData.dat');
inDataC = importdata('cryoyears/allCryoData.dat');

% Everything below indexes the tables as plain numeric matrices and reads
% element (1,1), so fail early with a readable message if a file is empty,
% has too few columns, or was not imported as a numeric matrix (e.g. because
% stray text such as a CSV header line was left in it).
if ~isnumeric(inDataX) || isempty(inDataX) || size(inDataX,2) < 5
    error('xrayCryoDataParse:badXrayData', ...
        'xrayyears/allXrayData.dat must be a non-empty, purely numeric table with at least 5 columns.');
end
if ~isnumeric(inDataC) || isempty(inDataC) || size(inDataC,2) < 5
    error('xrayCryoDataParse:badCryoData', ...
        'cryoyears/allCryoData.dat must be a non-empty, purely numeric table with at least 5 columns.');
end

% column 1 = year; column 4 = MW [kDa]; column 5 = resolution [Angstrom]
NdataX = size(inDataX,1);        % Number of x-ray data points
NdataC = size(inDataC,1);        % Number of cryo data points

yrX   = inDataX(:,1);            % Year x-ray structure was deposited
MWX   = inDataX(:,4);            % Molecular weight of x-ray structure
resnX = inDataX(:,5);            % Resolution in Angstroms of x-ray structure

yrC   = inDataC(:,1);            % Year cryo structure was deposited
MWC   = inDataC(:,4);            % Molecular weight of cryo structure
resnC = inDataC(:,5);            % Resolution in Angstroms of cryo structure

% Pieces of the per-year output file names: <directory>/year<YYYY>.dat
str1X = 'xrayyears/year';        % Created files in directory 'xrayyears'
str1C = 'cryoyears/year';        % Created files in directory 'cryoyears'
str3 = '.dat';

yearX = inDataX(1,1);   % First year in x-ray data file
yearC = inDataC(1,1);   % First year in cryo data file

 

% Now split x-ray data into files according to the deposition year.
%
% Rows are grouped by year and each year file is written in one go:
%   - every file that is opened is also closed (no handle is left open for
%     rows whose year is out of order or not a number);
%   - rows do not have to be sorted by year in the input table;
%   - files are opened with 'w', so re-running the script regenerates the
%     year files instead of appending a second copy of every row to them;
%   - a failure to open an output file is reported instead of passing an
%     invalid file identifier to fprintf.
validX = isfinite(yrX);              % Ignore rows without a usable year
yearsX = unique(yrX(validX));        % Distinct years, ascending

for k = 1:numel(yearsX)
    yearX = yearsX(k);
    rowsX = (yrX == yearX);          % All deposits of this year

    yearFileName = strcat(str1X,num2str(yearX),str3);
    fileID = fopen(yearFileName,'w');
    if fileID == -1
        error('xrayCryoDataParse:cannotOpenFile', ...
            'Could not open %s for writing.', yearFileName);
    end

    % fprintf consumes its data column by column, so a 2-by-N matrix with
    % MW in row 1 and resolution in row 2 yields one "MW resolution" line
    % per deposit.
    fprintf(fileID,'%10.2f %10.3f\n',[MWX(rowsX), resnX(rowsX)].');
    fclose(fileID);
end

 

% Now split cryo data into files according to the deposition year.
%
% Rows are grouped by year and each year file is written in one go:
%   - every file that is opened is also closed (no handle is left open for
%     rows whose year is out of order or not a number);
%   - rows do not have to be sorted by year in the input table;
%   - files are opened with 'w', so re-running the script regenerates the
%     year files instead of appending a second copy of every row to them;
%   - a failure to open an output file is reported instead of passing an
%     invalid file identifier to fprintf.
validC = isfinite(yrC);              % Ignore rows without a usable year
yearsC = unique(yrC(validC));        % Distinct years, ascending

for k = 1:numel(yearsC)
    yearC = yearsC(k);
    rowsC = (yrC == yearC);          % All deposits of this year

    yearFileName = strcat(str1C,num2str(yearC),str3);
    fileID = fopen(yearFileName,'w');
    if fileID == -1
        error('xrayCryoDataParse:cannotOpenFile', ...
            'Could not open %s for writing.', yearFileName);
    end

    % fprintf consumes its data column by column, so a 2-by-N matrix with
    % MW in row 1 and resolution in row 2 yields one "MW resolution" line
    % per deposit.
    fprintf(fileID,'%10.2f %10.3f\n',[MWC(rowsC), resnC(rowsC)].');
    fclose(fileID);
end

% Program to plot out historical deposits of x-ray and cryoEM structures in

% the RCSB Protein Data Bank. It uses data prepared by the program

% xrayCryoDataParse.m 

 

clear; close all;

% Movie settings: a low frame rate so that each year stays on screen long
% enough to be read, encoded at maximum quality.
vid = VideoWriter('xraysCryoHistory.mp4','MPEG-4');
vid.FrameRate = 5;    % Default 30
vid.Quality = 100;    % Default 75
open(vid);

c1 = [0.85,0.75,0];             % gold for x-ray data
c2 = [0.215,0.267,0.578];       % dark blue for cryoEM data

% Pieces of the per-year input file names: <directory>/year<YYYY>.dat
str1X = 'xrayyears/year';       % Created files in directory 'xrayyears'
str1C = 'cryoyears/year';       % Created files in directory 'cryoyears'
str3 = '.dat';

str4 = 'Resolution [';
str5 = char(197);               % Character code 197 is the Angstrom symbol
str6 = ']';
strTot = [str4,str5,str6];

str9 = 'x-ray deposits = ';
str10 = 'cryo deposits = ';

sumCryo = 0;                    % Running total of cryoEM deposits plotted
sumXrays = 0;                   % Running total of x-ray deposits plotted

% The video file is open from here on; make sure it is closed again even if
% reading a data file or grabbing a frame fails.
try
    % The white background is set on this figure directly. Setting the root
    % default after the figure has been created would not affect it, and
    % would change the default of every later figure in the session.
    fig = figure('units','pixels','position',[0 0 1600 900], ...
        'ToolBar','none','Color',[1 1 1]);

    % Style the axes once. With hold switched on before the first scatter
    % call, plotting only adds points and never resets these properties.
    ax = axes('Parent',fig);
    hold(ax,'on');
    set(ax,'FontName','Helvetica','FontSize',25);
    set(ax,'TickLength',[0.016, 2]);
    set(ax,'XScale','log','YScale','log');
    set(ax,'LineWidth',2,'Box','on');
    xlim(ax,[0.1 1e5]);
    ylim(ax,[0.4 1e2]);
    xlabel(ax,'Molecular weight [kDa]');
    ylabel(ax,strTot);

    for i = 1960:2050 % From a date well before any surprise additions from PDB
        % to a date well into the future...
        str2 = num2str(i); % Year label
        xyearFileName = strcat(str1X,str2,str3);
        cyearFileName = strcat(str1C,str2,str3);
        haveX = isfile(xyearFileName);
        haveC = isfile(cyearFileName);

        % Years without any data file produce no frame.
        if ~(haveX || haveC)
            continue
        end

        if haveX
            xyearData = importdata(xyearFileName);
            xX = xyearData(:,1); % MW
            yX = xyearData(:,2); % Resolution
            % X-ray points are pushed to the bottom of the stack so that
            % they never cover the cryoEM points.
            uistack(scatter(ax,xX,yX,10,c1,'filled','LineWidth',1,...
                'MarkerFaceAlpha', 0.8),'bottom');
            sumXrays = sumXrays + size(xX,1);
        end

        if haveC
            cyearData = importdata(cyearFileName);
            xC = cyearData(:,1);    % MW
            yC = cyearData(:,2);    % Resolution
            uistack(scatter(ax,xC,yC,10,c2,'filled','LineWidth',1),'top');
            sumCryo = sumCryo + size(xC,1);
        end

        str7 = num2str(sumXrays);
        str8 = num2str(sumCryo);

        % The labels are created after the points so that they are drawn on
        % top of them, and removed again once the frame has been captured so
        % that the next year starts without them.
        hText1 = text(ax, 1e3, 1, str2, 'FontSize',25);
        hText2 = text(ax, 0.3, 40, [str9,str7], 'FontSize',25);
        hText3 = text(ax, 0.3, 25, [str10,str8], 'FontSize',25);

        frame = getframe(fig);
        writeVideo(vid,frame);

        delete(hText1);
        delete(hText2);
        delete(hText3);
    end
catch plotError
    close(vid);
    rethrow(plotError);
end

% Finalise the MP4 movie file
close(vid);