{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example Statistics - MDF Datasets\n",
    "Example: We want to know how many datasets are in MDF and which datasets have the most records.\n",
    "\n",
    "**Note: This example is not kept up-to-date with the latest statistics.**\n",
    "\n",
    "If you want the current MDF statistics, you must run this code yourself."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from mdf_forge.forge import Forge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the Forge client; the cells below run every search through this object.\n",
    "mdf = Forge()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 373/373 [03:21<00:00,  1.85it/s]\n"
     ]
    }
   ],
   "source": [
    "# Start by fetching every dataset entry. Fewer than 10,000 exist at present, so a plain `search()` suffices.\n",
    "res = mdf.search('mdf.resource_type:dataset', advanced=True)\n",
    "# Then collect (source_name, title, record count) for each dataset.\n",
    "mdf_resources = []\n",
    "for dataset in tqdm(res):\n",
    "    source_name = dataset['mdf']['source_name']\n",
    "    record_query = 'mdf.resource_type:record AND mdf.source_name:' + source_name\n",
    "    # limit=0: only the match count reported in the info dict is used here, not the returned records.\n",
    "    _unused_records, query_info = mdf.search(record_query, advanced=True, info=True, limit=0)\n",
    "    title = dataset['dc']['titles'][0]['title']\n",
    "    mdf_resources.append((source_name, title, query_info['total_query_matches']))\n",
    "# One row per dataset, built in a single step from the collected tuples.\n",
    "df = pd.DataFrame(mdf_resources, columns=['source_name', 'title', 'num_records'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of data resources: 373\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_name</th>\n",
       "      <th>title</th>\n",
       "      <th>num_records</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>372</th>\n",
       "      <td>sstein_stein_bandgap_2019</td>\n",
       "      <td>Machine learning of optical properties of mate...</td>\n",
       "      <td>478111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>oqmd</td>\n",
       "      <td>The Open Quantum Materials Database</td>\n",
       "      <td>395348</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>338</th>\n",
       "      <td>stein_bandgap_2019</td>\n",
       "      <td>Machine learning of optical properties of mate...</td>\n",
       "      <td>180900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>h2o_13</td>\n",
       "      <td>Machine-learning approach for one- and two-bod...</td>\n",
       "      <td>45482</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>ab_initio_solute_database</td>\n",
       "      <td>High-throughput Ab-initio Dilute Solute Diffus...</td>\n",
       "      <td>31488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>249</th>\n",
       "      <td>nist_xps_db</td>\n",
       "      <td>NIST X-ray Photoelectron Spectroscopy Database</td>\n",
       "      <td>29189</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>jarvis</td>\n",
       "      <td>JARVIS - Joint Automated Repository for Variou...</td>\n",
       "      <td>26559</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>amcs</td>\n",
       "      <td>The American Mineralogist Crystal Structure Da...</td>\n",
       "      <td>19842</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>330</th>\n",
       "      <td>w_14</td>\n",
       "      <td>Accuracy and transferability of Gaussian appro...</td>\n",
       "      <td>9693</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>bfcc13</td>\n",
       "      <td>Cluster expansion made easy with Bayesian comp...</td>\n",
       "      <td>3783</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>246</th>\n",
       "      <td>cip</td>\n",
       "      <td>Evaluation and comparison of classical interat...</td>\n",
       "      <td>3291</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sluschi</td>\n",
       "      <td>Solid and Liquid in Ultra Small Coexistence wi...</td>\n",
       "      <td>1618</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>331</th>\n",
       "      <td>surface_crystal_energy</td>\n",
       "      <td>Data from: Surface energies of elemental crystals</td>\n",
       "      <td>1216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>khazana_polymer</td>\n",
       "      <td>Khazana (Polymer)</td>\n",
       "      <td>1073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>327</th>\n",
       "      <td>mdr_item_1496</td>\n",
       "      <td>Ultrahigh Carbon Steel Micrographs</td>\n",
       "      <td>1007</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   source_name  \\\n",
       "372  sstein_stein_bandgap_2019   \n",
       "78                        oqmd   \n",
       "338         stein_bandgap_2019   \n",
       "75                      h2o_13   \n",
       "74   ab_initio_solute_database   \n",
       "249                nist_xps_db   \n",
       "4                       jarvis   \n",
       "6                         amcs   \n",
       "330                       w_14   \n",
       "76                      bfcc13   \n",
       "246                        cip   \n",
       "2                      sluschi   \n",
       "331     surface_crystal_energy   \n",
       "5              khazana_polymer   \n",
       "327              mdr_item_1496   \n",
       "\n",
       "                                                 title  num_records  \n",
       "372  Machine learning of optical properties of mate...       478111  \n",
       "78                 The Open Quantum Materials Database       395348  \n",
       "338  Machine learning of optical properties of mate...       180900  \n",
       "75   Machine-learning approach for one- and two-bod...        45482  \n",
       "74   High-throughput Ab-initio Dilute Solute Diffus...        31488  \n",
       "249     NIST X-ray Photoelectron Spectroscopy Database        29189  \n",
       "4    JARVIS - Joint Automated Repository for Variou...        26559  \n",
       "6    The American Mineralogist Crystal Structure Da...        19842  \n",
       "330  Accuracy and transferability of Gaussian appro...         9693  \n",
       "76   Cluster expansion made easy with Bayesian comp...         3783  \n",
       "246  Evaluation and comparison of classical interat...         3291  \n",
       "2    Solid and Liquid in Ultra Small Coexistence wi...         1618  \n",
       "331  Data from: Surface energies of elemental crystals         1216  \n",
       "5                                    Khazana (Polymer)         1073  \n",
       "327                 Ultrahigh Carbon Steel Micrographs         1007  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Report how many datasets were found, then display the 15 with the most records.\n",
    "n_datasets = len(df)\n",
    "print(\"Number of data resources: {n_datasets}\".format(n_datasets=n_datasets))\n",
    "by_record_count = df.sort_values(by=\"num_records\", ascending=False)\n",
    "by_record_count.head(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1230958"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Bonus: How many records are in MDF in total?\n",
    "# Adds up the per-dataset counts stored in the `num_records` column of `df`.\n",
    "df[\"num_records\"].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}