{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example Statistics - MDF Datasets\n", "Example: We want to know how many datasets are in MDF and which datasets have the most records.\n", "\n", "**Note: This example is not kept up-to-date with the latest statistics.**\n", "\n", "If you want the current MDF statistics, you must run this code yourself." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from tqdm import tqdm\n", "import pandas as pd\n", "from mdf_forge.forge import Forge" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "mdf = Forge()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 373/373 [03:21<00:00, 1.85it/s]\n" ] } ], "source": [ "# Start by fetching every dataset entry. MDF currently holds fewer than 10,000 datasets, so a plain `search()` is enough.\n", "datasets = mdf.search(\"mdf.resource_type:dataset\", advanced=True)\n", "# Then, for each dataset, collect its source_name, its title, and how many records belong to it.\n", "dataset_rows = []\n", "for entry in tqdm(datasets):\n", "    source_name = entry[\"mdf\"][\"source_name\"]\n", "    record_query = \"mdf.resource_type:record AND mdf.source_name:\" + source_name\n", "    # limit=0 because only the match count reported in `info` is used, not the hits themselves.\n", "    _hits, info = mdf.search(record_query, advanced=True, info=True, limit=0)\n", "    dataset_rows.append((source_name, entry[\"dc\"][\"titles\"][0][\"title\"], info[\"total_query_matches\"]))\n", "df = pd.DataFrame(dataset_rows, columns=[\"source_name\", \"title\", \"num_records\"])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of data resources: 373\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
source_nametitlenum_records
372sstein_stein_bandgap_2019Machine learning of optical properties of mate...478111
78oqmdThe Open Quantum Materials Database395348
338stein_bandgap_2019Machine learning of optical properties of mate...180900
75h2o_13Machine-learning approach for one- and two-bod...45482
74ab_initio_solute_databaseHigh-throughput Ab-initio Dilute Solute Diffus...31488
249nist_xps_dbNIST X-ray Photoelectron Spectroscopy Database29189
4jarvisJARVIS - Joint Automated Repository for Variou...26559
6amcsThe American Mineralogist Crystal Structure Da...19842
330w_14Accuracy and transferability of Gaussian appro...9693
76bfcc13Cluster expansion made easy with Bayesian comp...3783
246cipEvaluation and comparison of classical interat...3291
2sluschiSolid and Liquid in Ultra Small Coexistence wi...1618
331surface_crystal_energyData from: Surface energies of elemental crystals1216
5khazana_polymerKhazana (Polymer)1073
327mdr_item_1496Ultrahigh Carbon Steel Micrographs1007
\n", "
" ], "text/plain": [ " source_name \\\n", "372 sstein_stein_bandgap_2019 \n", "78 oqmd \n", "338 stein_bandgap_2019 \n", "75 h2o_13 \n", "74 ab_initio_solute_database \n", "249 nist_xps_db \n", "4 jarvis \n", "6 amcs \n", "330 w_14 \n", "76 bfcc13 \n", "246 cip \n", "2 sluschi \n", "331 surface_crystal_energy \n", "5 khazana_polymer \n", "327 mdr_item_1496 \n", "\n", " title num_records \n", "372 Machine learning of optical properties of mate... 478111 \n", "78 The Open Quantum Materials Database 395348 \n", "338 Machine learning of optical properties of mate... 180900 \n", "75 Machine-learning approach for one- and two-bod... 45482 \n", "74 High-throughput Ab-initio Dilute Solute Diffus... 31488 \n", "249 NIST X-ray Photoelectron Spectroscopy Database 29189 \n", "4 JARVIS - Joint Automated Repository for Variou... 26559 \n", "6 The American Mineralogist Crystal Structure Da... 19842 \n", "330 Accuracy and transferability of Gaussian appro... 9693 \n", "76 Cluster expansion made easy with Bayesian comp... 3783 \n", "246 Evaluation and comparison of classical interat... 3291 \n", "2 Solid and Liquid in Ultra Small Coexistence wi... 
1618 \n", "331 Data from: Surface energies of elemental crystals 1216 \n", "5 Khazana (Polymer) 1073 \n", "327 Ultrahigh Carbon Steel Micrographs 1007 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Finally, report how many datasets were found and show the 15 datasets with the most records.\n", "print(\"Number of data resources: {n_datasets}\".format(n_datasets=len(df)))\n", "df.sort_values(by=\"num_records\", ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1230958" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Bonus: How many records are in MDF in total? Sum the per-dataset record counts.\n", "df[\"num_records\"].sum()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }