{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compare jobs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from arche import Arche"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare job 235801/1/15 (source) against job 235801/1/14 (target).\n",
    "a = Arche(\n",
    "    source=\"235801/1/15\",\n",
    "    target=\"235801/1/14\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll reuse the JSON schema from Basics to validate the scraped items."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON schema (draft-07) the items are validated against.\n",
    "a.schema = {\n",
    "    \"$schema\": \"http://json-schema.org/draft-07/schema#\",\n",
    "    \"definitions\": {\n",
    "        \"float\": {\n",
    "            \"pattern\": \"^-?[0-9]+\\\\.[0-9]{2}$\"\n",
    "        },\n",
    "        \"url\": {\n",
    "            \"pattern\": \"^https?://(www\\\\.)?[a-z0-9.-]*\\\\.[a-z]{2,}([^<>%\\\\x20\\\\x00-\\\\x1f\\\\x7F]|%[0-9a-fA-F]{2})*$\"\n",
    "        }\n",
    "    },\n",
    "    \"additionalProperties\": False,\n",
    "    \"type\": \"object\",\n",
    "    \"properties\": {\n",
    "        \"category\": {\"type\": \"string\", \"tag\": [\"category\"]},\n",
    "        # Price: pound sign, two digits, a literal dot, two digits (e.g. £51.77).\n",
    "        # Backslashes are doubled so the regex engine receives \\\\d and \\\\. verbatim.\n",
    "        \"price\": {\"type\": \"string\", \"pattern\": \"^£\\\\d{2}\\\\.\\\\d{2}$\"},\n",
    "        \"_type\": {\"type\": \"string\"},\n",
    "        \"description\": {\"type\": \"string\"},\n",
    "        \"title\": {\"type\": \"string\", \"tag\": [\"unique\"]},\n",
    "        \"_key\": {\"type\": \"string\"}\n",
    "    },\n",
    "    \"required\": [\n",
    "        \"_key\",\n",
    "        \"_type\",\n",
    "        \"category\",\n",
    "        \"description\",\n",
    "        \"price\",\n",
    "        \"title\"\n",
    "    ]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c1d74a54989846aa970212879c955d57",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='JSON Schema Validation', max=1000, style=ProgressStyle(descri…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Job Outcome:\n",
      "\tFinished\n",
      "\n",
      "Job Errors:\n",
      "\tNo errors\n",
      "\n",
      "Responses Per Item Ratio:\n",
      "\tNumber of responses / Number of scraped items - 1.05\n",
      "\n",
      "Total Scraped Items:\n",
      "\tSame number of items\n",
      "\n",
      "Compare Runtime:\n",
      "\tSimilar or better runtime - 0:04:15.526000 and 0:04:46.466000\n",
      "\n",
      "Finish Time:\n",
      "\u001b[33m\t16 day(s) difference between 2 jobs\u001b[0m\n",
      "\n",
      "Fields Coverage:\n",
      "\t0 totally empty field(s)\n",
      "\n",
      "Boolean Fields:\n",
      "\tNo fields to compare\n",
      "\n",
      "JSON Schema Validation:\n",
      "\u001b[31m\t1000 items were checked, 1 error(s)\u001b[0m\n",
      "\n",
      "Tags:\n",
      "\tcategory, unique\n",
      "\n",
      "Compare Price Was And Now:\n",
      "\tproduct_price_field or product_price_was_field tags were not found in schema\n",
      "\n",
      "Uniqueness:\n",
      "\u001b[31m\t'title' contains 1 duplicated value(s)\u001b[0m\n",
      "\n",
      "Duplicated Items:\n",
      "\t'name_field' and 'product_url_field' tags were not found in schema\n",
      "\n",
      "Coverage For Scraped Categories:\n",
      "\t50 categories in 'category'\n",
      "\n",
      "Compare Scraped Categories:\n",
      "\tSimilar coverage per category with 10% tolerance\n",
      "\tSame categories: 50; new categories: 0; missing categories: 0\n",
      "\n",
      "Compare Prices For Same Urls:\n",
      "\tproduct_url_field tag is not set\n",
      "\n",
      "Compare Names Per Url:\n",
      "\tproduct_url_field tag is not set\n",
      "\n",
      "Compare Prices For Same Names:\n",
      "\tname_field tag is not set\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "RULE: Fields Coverage\n",
      "(1 message(s))\n",
      "\n",
      "             Values Count  Percent\n",
      "Field                             \n",
      "description           998       99\n",
      "_key                 1000      100\n",
      "_type                1000      100\n",
      "category             1000      100\n",
      "price                1000      100\n",
      "title                1000      100\n",
      "\n",
      "RULE: JSON Schema Validation\n",
      "(1 message(s))\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "2 items affected - description is not of type 'string': <a href='https://app.scrapinghub.com/p/235801/1/15/item/162'>162</a> <a href='https://app.scrapinghub.com/p/235801/1/15/item/980'>980</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<br>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "RULE: Uniqueness\n",
      "(1 message(s))\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "2 items affected - same 'The Star-Touched Queen' title: <a href='https://app.scrapinghub.com/p/235801/1/15/item/220'>220</a> <a href='https://app.scrapinghub.com/p/235801/1/15/item/396'>396</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<br>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "RULE: Coverage For Scraped Categories\n",
      "(1 message(s))\n",
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.plotly.v1+json": {
       "config": {
        "linkText": "Export to plot.ly",
        "plotlyServerURL": "https://plot.ly",
        "showLink": true
       },
       "data": [
        {
         "marker": {
          "color": "rgba(226, 74, 51, 0.6)",
          "line": {
           "color": "rgba(226, 74, 51, 1.0)",
           "width": 1
          }
         },
         "name": "category",
         "orientation": "h",
         "text": "",
         "type": "bar",
         "uid": "7a351bdc-6be5-45e3-acd9-c13610506ce3",
         "x": [
          152,
          110,
          75,
          67,
          65,
          54,
          48,
          35,
          32,
          30,
          29,
          26,
          19,
          19,
          18,
          17,
          17,
          16,
          14,
          13,
          12,
          11,
          11,
          11,
          10,
          9,
          8,
          7,
          7,
          6,
          6,
          6,
          5,
          5,
          5,
          4,
          3,
          3,
          3,
          2,
          1,
          1,
          1,
          1,
          1,
          1,
          1,
          1,
          1,
          1
         ],
         "y": [
          "Default",
          "Nonfiction",
          "Sequential Art",
          "Add a comment",
          "Fiction",
          "Young Adult",
          "Fantasy",
          "Romance",
          "Mystery",
          "Food and Drink",
          "Childrens",
          "Historical Fiction",
          "Poetry",
          "Classics",
          "History",
          "Horror",
          "Womens Fiction",
          "Science Fiction",
          "Science",
          "Music",
          "Business",
          "Travel",
          "Thriller",
          "Philosophy",
          "Humor",
          "Autobiography",
          "Art",
          "Psychology",
          "Religion",
          "Christian Fiction",
          "New Adult",
          "Spirituality",
          "Biography",
          "Sports and Games",
          "Self Help",
          "Health",
          "Politics",
          "Contemporary",
          "Christian",
          "Historical",
          "Cultural",
          "Erotica",
          "Novels",
          "Crime",
          "Parenting",
          "Adult Fiction",
          "Academic",
          "Suspense",
          "Paranormal",
          "Short Stories"
         ]
        }
       ],
       "layout": {
        "yaxis": {
         "automargin": true,
         "side": "right"
        }
       }
      },
      "text/html": [
       "<div id=\"46c47489-6a6a-402d-889e-625353e4759a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";\n",
       "if (document.getElementById(\"46c47489-6a6a-402d-889e-625353e4759a\")) {\n",
       "    Plotly.newPlot(\"46c47489-6a6a-402d-889e-625353e4759a\", [{\"marker\": {\"color\": \"rgba(226, 74, 51, 0.6)\", \"line\": {\"color\": \"rgba(226, 74, 51, 1.0)\", \"width\": 1}}, \"name\": \"category\", \"orientation\": \"h\", \"text\": \"\", \"x\": [152, 110, 75, 67, 65, 54, 48, 35, 32, 30, 29, 26, 19, 19, 18, 17, 17, 16, 14, 13, 12, 11, 11, 11, 10, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 4, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"y\": [\"Default\", \"Nonfiction\", \"Sequential Art\", \"Add a comment\", \"Fiction\", \"Young Adult\", \"Fantasy\", \"Romance\", \"Mystery\", \"Food and Drink\", \"Childrens\", \"Historical Fiction\", \"Poetry\", \"Classics\", \"History\", \"Horror\", \"Womens Fiction\", \"Science Fiction\", \"Science\", \"Music\", \"Business\", \"Travel\", \"Thriller\", \"Philosophy\", \"Humor\", \"Autobiography\", \"Art\", \"Psychology\", \"Religion\", \"Christian Fiction\", \"New Adult\", \"Spirituality\", \"Biography\", \"Sports and Games\", \"Self Help\", \"Health\", \"Politics\", \"Contemporary\", \"Christian\", \"Historical\", \"Cultural\", \"Erotica\", \"Novels\", \"Crime\", \"Parenting\", \"Adult Fiction\", \"Academic\", \"Suspense\", \"Paranormal\", \"Short Stories\"], \"type\": \"bar\", \"uid\": \"313cd8b3-9f67-4a0a-b201-16df2100e9cc\"}], {\"yaxis\": {\"automargin\": true, \"side\": \"right\"}}, {\"showLink\": true, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\"}); \n",
       "}\n",
       "});</script><script type=\"text/javascript\">window.addEventListener(\"resize\", function(){if (document.getElementById(\"46c47489-6a6a-402d-889e-625353e4759a\")) {window._Plotly.Plots.resize(document.getElementById(\"46c47489-6a6a-402d-889e-625353e4759a\"));};})</script>"
      ],
      "text/vnd.plotly.v1+html": [
       "<div id=\"46c47489-6a6a-402d-889e-625353e4759a\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";\n",
       "if (document.getElementById(\"46c47489-6a6a-402d-889e-625353e4759a\")) {\n",
       "    Plotly.newPlot(\"46c47489-6a6a-402d-889e-625353e4759a\", [{\"marker\": {\"color\": \"rgba(226, 74, 51, 0.6)\", \"line\": {\"color\": \"rgba(226, 74, 51, 1.0)\", \"width\": 1}}, \"name\": \"category\", \"orientation\": \"h\", \"text\": \"\", \"x\": [152, 110, 75, 67, 65, 54, 48, 35, 32, 30, 29, 26, 19, 19, 18, 17, 17, 16, 14, 13, 12, 11, 11, 11, 10, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 4, 3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], \"y\": [\"Default\", \"Nonfiction\", \"Sequential Art\", \"Add a comment\", \"Fiction\", \"Young Adult\", \"Fantasy\", \"Romance\", \"Mystery\", \"Food and Drink\", \"Childrens\", \"Historical Fiction\", \"Poetry\", \"Classics\", \"History\", \"Horror\", \"Womens Fiction\", \"Science Fiction\", \"Science\", \"Music\", \"Business\", \"Travel\", \"Thriller\", \"Philosophy\", \"Humor\", \"Autobiography\", \"Art\", \"Psychology\", \"Religion\", \"Christian Fiction\", \"New Adult\", \"Spirituality\", \"Biography\", \"Sports and Games\", \"Self Help\", \"Health\", \"Politics\", \"Contemporary\", \"Christian\", \"Historical\", \"Cultural\", \"Erotica\", \"Novels\", \"Crime\", \"Parenting\", \"Adult Fiction\", \"Academic\", \"Suspense\", \"Paranormal\", \"Short Stories\"], \"type\": \"bar\", \"uid\": \"313cd8b3-9f67-4a0a-b201-16df2100e9cc\"}], {\"yaxis\": {\"automargin\": true, \"side\": \"right\"}}, {\"showLink\": true, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\"}); \n",
       "}\n",
       "});</script><script type=\"text/javascript\">window.addEventListener(\"resize\", function(){if (document.getElementById(\"46c47489-6a6a-402d-889e-625353e4759a\")) {window._Plotly.Plots.resize(document.getElementById(\"46c47489-6a6a-402d-889e-625353e4759a\"));};})</script>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Run the rules and print the summary followed by per-rule details.\n",
    "a.report_all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
