[components][m] - move catalog to @portaljs/components

2023-05-04 11:14:39 -03:00
parent cf2a93abfd
commit ad52721a38
9 changed files with 284 additions and 53 deletions
--- a/packages/components/package-lock.json
+++ b/packages/components/package-lock.json
@@ -1,19 +1,22 @@
 {
  "name": "@portaljs/components",
-  "version": "0.0.1",
+  "version": "0.0.3",
  "lockfileVersion": 2,
  "requires": true,
  "packages": {
    "": {
      "name": "@portaljs/components",
-      "version": "0.0.1",
+      "version": "0.0.3",
      "dependencies": {
        "@heroicons/react": "^2.0.17",
        "@tanstack/react-table": "^8.8.5",
+        "@types/flexsearch": "^0.7.3",
+        "flexsearch": "0.7.21",
        "next-mdx-remote": "^4.4.1",
        "papaparse": "^5.4.1",
        "react": "^18.2.0",
        "react-dom": "^18.2.0",
+        "react-hook-form": "^7.43.9",
        "react-vega": "^7.6.0",
        "vega": "5.20.2",
        "vega-lite": "5.1.0"
@@ -4466,6 +4469,11 @@
      "dev": true,
      "license": "MIT"
    },
+    "node_modules/@types/flexsearch": {
+      "version": "0.7.3",
+      "resolved": "https://registry.npmjs.org/@types/flexsearch/-/flexsearch-0.7.3.tgz",
+      "integrity": "sha512-HXwADeHEP4exXkCIwy2n1+i0f1ilP1ETQOH5KDOugjkTFZPntWo0Gr8stZOaebkxsdx+k0X/K6obU/+it07ocg=="
+    },
    "node_modules/@types/glob": {
      "version": "8.1.0",
      "dev": true,
@@ -8008,6 +8016,11 @@
      "dev": true,
      "license": "ISC"
    },
+    "node_modules/flexsearch": {
+      "version": "0.7.21",
+      "resolved": "https://registry.npmjs.org/flexsearch/-/flexsearch-0.7.21.tgz",
+      "integrity": "sha512-W7cHV7Hrwjid6lWmy0IhsWDFQboWSng25U3VVywpHOTJnnAZNPScog67G+cVpeX9f7yDD21ih0WDrMMT+JoaYg=="
+    },
    "node_modules/flow-parser": {
      "version": "0.205.0",
      "dev": true,
@@ -11993,6 +12006,21 @@
      "dev": true,
      "license": "MIT"
    },
+    "node_modules/react-hook-form": {
+      "version": "7.43.9",
+      "resolved": "https://registry.npmjs.org/react-hook-form/-/react-hook-form-7.43.9.tgz",
+      "integrity": "sha512-AUDN3Pz2NSeoxQ7Hs6OhQhDr6gtF9YRuutGDwPQqhSUAHJSgGl2VeY3qN19MG0SucpjgDiuMJ4iC5T5uB+eaNQ==",
+      "engines": {
+        "node": ">=12.22.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/react-hook-form"
+      },
+      "peerDependencies": {
+        "react": "^16.8.0 || ^17 || ^18"
+      }
+    },
    "node_modules/react-inspector": {
      "version": "6.0.1",
      "dev": true,
@@ -18010,6 +18038,11 @@
      "version": "3.2.1",
      "dev": true
    },
+    "@types/flexsearch": {
+      "version": "0.7.3",
+      "resolved": "https://registry.npmjs.org/@types/flexsearch/-/flexsearch-0.7.3.tgz",
+      "integrity": "sha512-HXwADeHEP4exXkCIwy2n1+i0f1ilP1ETQOH5KDOugjkTFZPntWo0Gr8stZOaebkxsdx+k0X/K6obU/+it07ocg=="
+    },
    "@types/glob": {
      "version": "8.1.0",
      "dev": true,
@@ -20245,6 +20278,11 @@
      "version": "3.2.7",
      "dev": true
    },
+    "flexsearch": {
+      "version": "0.7.21",
+      "resolved": "https://registry.npmjs.org/flexsearch/-/flexsearch-0.7.21.tgz",
+      "integrity": "sha512-W7cHV7Hrwjid6lWmy0IhsWDFQboWSng25U3VVywpHOTJnnAZNPScog67G+cVpeX9f7yDD21ih0WDrMMT+JoaYg=="
+    },
    "flow-parser": {
      "version": "0.205.0",
      "dev": true
@@ -22587,6 +22625,12 @@
        }
      }
    },
+    "react-hook-form": {
+      "version": "7.43.9",
+      "resolved": "https://registry.npmjs.org/react-hook-form/-/react-hook-form-7.43.9.tgz",
+      "integrity": "sha512-AUDN3Pz2NSeoxQ7Hs6OhQhDr6gtF9YRuutGDwPQqhSUAHJSgGl2VeY3qN19MG0SucpjgDiuMJ4iC5T5uB+eaNQ==",
+      "requires": {}
+    },
    "react-inspector": {
      "version": "6.0.1",
      "dev": true,
--- a/packages/components/package.json
+++ b/packages/components/package.json
@@ -25,14 +25,17 @@
  },
  "dependencies": {
    "@heroicons/react": "^2.0.17",
+    "@tanstack/react-table": "^8.8.5",
+    "@types/flexsearch": "^0.7.3",
+    "flexsearch": "0.7.21",
    "next-mdx-remote": "^4.4.1",
    "papaparse": "^5.4.1",
    "react": "^18.2.0",
    "react-dom": "^18.2.0",
+    "react-hook-form": "^7.43.9",
    "react-vega": "^7.6.0",
    "vega": "5.20.2",
-    "vega-lite": "5.1.0",
-    "@tanstack/react-table": "^8.8.5"
+    "vega-lite": "5.1.0"
  },
  "devDependencies": {
    "@storybook/addon-essentials": "^7.0.7",
--- a/packages/components/src/components/Catalog.tsx
+++ b/packages/components/src/components/Catalog.tsx
@@ -0,0 +1,157 @@
+import { Index } from 'flexsearch';
+import { useMemo, useState } from 'react';
+import DebouncedInput from './DebouncedInput';
+import { useForm } from 'react-hook-form';
+
+/** Form state as seen by the filters: one `selectedValue` per facet name. */
+type FacetSelections = Record<string, { selectedValue?: string } | undefined>;
+
+/**
+ * Builds the text indexed for a dataset: every metadata value plus the
+ * url_path, joined into one string. Null/undefined values are skipped because
+ * calling `toString()` on them would crash indexing.
+ */
+function toSearchText(dataset: any): string {
+  const values = Object.values(dataset.metadata ?? {})
+    .filter((value) => value != null)
+    .map((value) => String(value));
+  return [...values, dataset.url_path].join(' ');
+}
+
+/**
+ * Collects, for each facet, the distinct values it takes across all datasets,
+ * in first-seen order. Values are stringified because that is how the
+ * `<select>` reports them; array-valued metadata contributes each element.
+ */
+function collectFacetOptions(
+  datasets: any[],
+  facets: string[]
+): Record<string, string[]> {
+  const options: Record<string, string[]> = {};
+  for (const facet of facets) {
+    const seen = new Set<string>();
+    for (const dataset of datasets) {
+      const raw = dataset.metadata?.[facet];
+      for (const value of Array.isArray(raw) ? raw : [raw]) {
+        // '' is the value of the "no filter" <option>, so it cannot be a choice
+        if (value != null && value !== '') seen.add(String(value));
+      }
+    }
+    options[facet] = Array.from(seen);
+  }
+  return options;
+}
+
+/**
+ * Returns true when the dataset carries the selected value of every facet
+ * that currently has a selection. Values are compared as whole strings, so
+ * "English" does not match "Old English" and numeric metadata does not throw.
+ * A dataset that lacks a selected facet altogether is filtered out.
+ */
+function matchesSelections(
+  dataset: any,
+  facets: string[],
+  selections: FacetSelections | undefined
+): boolean {
+  return facets.every((facet) => {
+    const selected = selections?.[facet]?.selectedValue;
+    // Nothing chosen for this facet, or its field is not registered yet
+    if (!selected) return true;
+    const raw = dataset.metadata?.[facet];
+    const values: unknown[] = Array.isArray(raw) ? raw : [raw];
+    return values.some((value) => value != null && String(value) === selected);
+  });
+}
+
+/**
+ * Searchable list of datasets with optional dropdown filters.
+ *
+ * @param datasets - items exposing `_id`, `url_path` and a `metadata` record;
+ *   `metadata.title` is used as the link text when present.
+ * @param facets - metadata field names to offer as dropdown filters; when
+ *   omitted, only the search box is rendered.
+ */
+export function Catalog({
+  datasets,
+  facets,
+}: {
+  datasets: any[];
+  facets?: string[];
+}) {
+  const [indexFilter, setIndexFilter] = useState('');
+  const activeFacets = useMemo(() => facets ?? [], [facets]);
+
+  // Building the full-text index is the expensive part, so redo it only when
+  // the dataset list changes instead of on every render.
+  const index = useMemo(() => {
+    const built = new Index({ tokenize: 'full' });
+    datasets.forEach((dataset) => built.add(dataset._id, toSearchText(dataset)));
+    return built;
+  }, [datasets]);
+
+  const facetOptions = useMemo(
+    () => collectFacetOptions(datasets, activeFacets),
+    [datasets, activeFacets]
+  );
+
+  // Fields are registered by the <select>s below; the facet options are not
+  // useForm configuration, so nothing is passed here.
+  const { register, watch } = useForm();
+  // Can be empty before the fields are registered (e.g. on the server render)
+  const selections = watch();
+
+  // Search once per render rather than once per dataset. The limit is raised
+  // to the dataset count because flexsearch caps results at 100 by default.
+  const matchedIds =
+    indexFilter !== ''
+      ? new Set(index.search(indexFilter, datasets.length))
+      : null;
+
+  const filteredDatasets = datasets.filter(
+    (dataset) =>
+      (matchedIds === null || matchedIds.has(dataset._id)) &&
+      matchesSelections(dataset, activeFacets, selections)
+  );
+
+  return (
+    <>
+      <DebouncedInput
+        value={indexFilter ?? ''}
+        onChange={(value) => setIndexFilter(String(value))}
+        className="p-2 text-sm shadow border border-block mr-1"
+        placeholder="Search all datasets..."
+      />
+      {Object.entries(facetOptions).map(([facet, values]) => (
+        <select
+          key={facet}
+          defaultValue=""
+          className="p-2 ml-1 text-sm shadow border border-block"
+          {...register(facet + '.selectedValue')}
+        >
+          <option value="">
+            Filter by {facet}
+          </option>
+          {values.map((val) => (
+            <option
+              key={val}
+              className="dark:bg-white dark:text-black"
+              value={val}
+            >
+              {val}
+            </option>
+          ))}
+        </select>
+      ))}
+      <ul className='mb-5 pl-6 mt-5 list-disc'>
+        {filteredDatasets.map((dataset) => (
+          <li className='py-2' key={dataset._id}>
+            <a className='font-medium underline' href={dataset.url_path}>
+              {dataset.metadata?.title || dataset.url_path}
+            </a>
+          </li>
+        ))}
+      </ul>
+    </>
+  );
+}
+
--- a/packages/components/src/index.ts
+++ b/packages/components/src/index.ts
@@ -1,4 +1,5 @@
 export * from "./components/Table";
+export * from "./components/Catalog";
 export * from "./components/LineChart";
 export * from "./components/Vega";
-export * from "./components/VegaLite";
+export * from "./components/VegaLite";
--- a/packages/components/stories/Catalog.stories.ts
+++ b/packages/components/stories/Catalog.stories.ts
@@ -0,0 +1,135 @@
+import type { Meta, StoryObj } from '@storybook/react';
+
+import { Catalog } from '../src/components/Catalog';
+
+// More on how to set up stories at: https://storybook.js.org/docs/react/writing-stories/introduction
+const meta: Meta = {
+  title: 'Components/Catalog',
+  component: Catalog,
+  tags: ['autodocs'],
+  argTypes: {
+    datasets: {
+      description:
+        'Lists of datasets to be displayed in the list, will usually be automatically available',
+    },
+    facets: {
+      description:
+        'List of frontmatter fields that should be used as filters, needs to match exactly with the field name',
+    },
+  },
+};
+
+export default meta;
+
+type Story = StoryObj<{ datasets: any; facets: string[] }>;
+
+// Sample datasets shared by every story so the fixture is defined only once
+const datasets = [
+  {
+    _id: '07026b22d49916754df1dc8ffb9ccd1c31878aae',
+    url_path: 'dataset-4',
+    file_path: 'content/dataset-4/index.md',
+    metadata: {
+      title: 'Detecting Abusive Albanian',
+      'link-to-publication': 'https://arxiv.org/abs/2107.13592',
+      'link-to-data': 'https://doi.org/10.6084/m9.figshare.19333298.v1',
+      'task-description':
+        'Hierarchical (offensive/not; untargeted/targeted; person/group/other)',
+      'details-of-task':
+        'Detect and categorise abusive language in social media data',
+      'size-of-dataset': 11874,
+      'percentage-abusive': 13.2,
+      language: 'Albanian',
+      'level-of-annotation': ['Posts'],
+      platform: ['Instagram', 'Youtube'],
+      medium: ['Text'],
+      reference:
+        'Nurce, E., Keci, J., Derczynski, L., 2021. Detecting Abusive Albanian. arXiv:2107.13592',
+    },
+  },
+  {
+    _id: '42c86cf3c4fbbab11d91c2a7d6dcb8f750bc4e19',
+    url_path: 'dataset-1',
+    file_path: 'content/dataset-1/index.md',
+    metadata: {
+      title: 'AbuseEval v1.0',
+      'link-to-publication':
+        'http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.760.pdf',
+      'link-to-data': 'https://github.com/tommasoc80/AbuseEval',
+      'task-description':
+        'Explicitness annotation of offensive and abusive content',
+      'details-of-task':
+        'Enriched versions of the OffensEval/OLID dataset with the distinction of explicit/implicit offensive messages and the new dimension for abusive messages. Labels for offensive language: EXPLICIT, IMPLICT, NOT; Labels for abusive language: EXPLICIT, IMPLICT, NOTABU',
+      'size-of-dataset': 14100,
+      'percentage-abusive': 20.75,
+      language: 'English',
+      'level-of-annotation': ['Tweets'],
+      platform: ['Twitter'],
+      medium: ['Text'],
+      reference:
+        'Caselli, T., Basile, V., Jelena, M., Inga, K., and Michael, G. 2020. "I feel offended, don’t be abusive! implicit/explicit messages in offensive and abusive language". The 12th Language Resources and Evaluation Conference (pp. 6193-6202). European Language Resources Association.',
+    },
+  },
+  {
+    _id: '80001dd32a752421fdcc64e91fbd237dc31d6bb3',
+    url_path: 'dataset-2',
+    file_path: 'content/dataset-2/index.md',
+    metadata: {
+      title:
+        'Abusive Language Detection on Arabic Social Media (Al Jazeera)',
+      'link-to-publication': 'https://www.aclweb.org/anthology/W17-3008',
+      'link-to-data':
+        'http://alt.qcri.org/~hmubarak/offensive/AJCommentsClassification-CF.xlsx',
+      'task-description':
+        'Ternary (Obscene, Offensive but not obscene, Clean)',
+      'details-of-task': 'Incivility',
+      'size-of-dataset': 32000,
+      'percentage-abusive': 0.81,
+      language: 'Arabic',
+      'level-of-annotation': ['Posts'],
+      platform: ['AlJazeera'],
+      medium: ['Text'],
+      reference:
+        'Mubarak, H., Darwish, K. and Magdy, W., 2017. Abusive Language Detection on Arabic Social Media. In: Proceedings of the First Workshop on Abusive Language Online. Vancouver, Canada: Association for Computational Linguistics, pp.52-56.',
+    },
+  },
+  {
+    _id: '96649d05d8193f4333b10015af76c6562971bd8c',
+    url_path: 'dataset-3',
+    file_path: 'content/dataset-3/index.md',
+    metadata: {
+      title: 'CoRAL: a Context-aware Croatian Abusive Language Dataset',
+      'link-to-publication':
+        'https://aclanthology.org/2022.findings-aacl.21/',
+      'link-to-data':
+        'https://github.com/shekharRavi/CoRAL-dataset-Findings-of-the-ACL-AACL-IJCNLP-2022',
+      'task-description':
+        'Multi-class based on context dependency categories (CDC)',
+      'details-of-task': 'Detectioning CDC from abusive comments',
+      'size-of-dataset': 2240,
+      'percentage-abusive': 100,
+      language: 'Croatian',
+      'level-of-annotation': ['Posts'],
+      platform: ['Posts'],
+      medium: ['Newspaper Comments'],
+      reference:
+        'Ravi Shekhar, Mladen Karan and Matthew Purver (2022). CoRAL: a Context-aware Croatian Abusive Language Dataset. Findings of the ACL: AACL-IJCNLP.',
+    },
+  },
+];
+
+// More on writing stories with args: https://storybook.js.org/docs/react/writing-stories/args
+// `facets` is omitted here, so the component renders no filter dropdowns
+export const WithoutFacets: Story = {
+  name: 'Catalog without facets',
+  args: { datasets },
+};
+
+// Adds one dropdown per listed metadata field
+export const WithFacets: Story = {
+  name: 'Catalog with facets',
+  args: {
+    datasets,
+    facets: ['language', 'platform'],
+  },
+};