[alan-turing][m] - individual pages (#828)

This commit is contained in:
Luccas Mateus 2023-05-01 21:06:52 -03:00 committed by GitHub
parent a041d69282
commit 026059184a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 6022 additions and 404 deletions

View File

@ -0,0 +1,22 @@
---
title: Contributing
---
We accept entries to our catalogue based on pull requests to the content folder. The dataset must be available for download to be included in the list. If you want to add an entry, follow these steps!
Please send just one dataset addition/edit at a time - edit it in, then save. This will make everyone's life easier (including yours!)
- Go to the repo url file and click the "Add file" dropdown and then click on "Create new file".
![](https://i.imgur.com/2PR0ZgL.png)
- In the following page, type `content/datasets/<name-of-the-file>.md` if you want to add an entry to the datasets catalog, or `content/keywords/<name-of-the-file>.md` if you want to add an entry to the lists of abusive keywords.
![](https://i.imgur.com/rr3uSYu.png)
- Copy the contents of `templates/dataset.md` or `templates/keywords.md` respectively to the text box below, filling out the fields with the correct data format.
![](https://i.imgur.com/x6JIjhz.png)
- Click on "Commit changes". In the popup, make sure you give some brief detail on the proposed change, and then click on "Propose changes".
![](https://i.imgur.com/BxuxKEJ.png)
- Submit the pull request on the next page when prompted.

View File

@ -12,3 +12,5 @@ platform: ["AlJazeera"]
medium: ["Text"]
reference: "Mubarak, H., Darwish, K. and Magdy, W., 2017. Abusive Language Detection on Arabic Social Media. In: Proceedings of the First Workshop on Abusive Language Online. Vancouver, Canada: Association for Computational Linguistics, pp.52-56."
---
SOMETHING TEST

View File

@ -12,3 +12,4 @@ platform: ["Youtube", "Facebook"]
medium: ["Text"]
reference: "Romim, N., Ahmed, M., Talukder, H., & Islam, M. S. (2021). Hate speech detection in the bengali language: A dataset and its baseline evaluation. In Proceedings of International Joint Conference on Advances in Computational Intelligence (pp. 457-468). Springer, Singapore."
---

View File

@ -1,3 +1,7 @@
---
title: Hate Speech Dataset Catalogue
---
This page catalogues datasets annotated for hate speech, online abuse, and offensive language. They may be useful for e.g. training a natural language processing system to detect this language.
The list is maintained by Leon Derczynski, Bertie Vidgen, Hannah Rose Kirk, Pica Johansson, Yi-Ling Chung, Mads Guldborg Kjeldgaard Kongsbak, Laila Sprejer, and Philine Zeinert.

View File

@ -0,0 +1,10 @@
---
title: Hurtlex
description: HurtLex is a lexicon of offensive, aggressive, and hateful words in over 50 languages. The words are divided into 17 categories, plus a macro-category indicating whether there is stereotype involved.
data-link: https://github.com/valeriobasile/hurtlex
reference: http://ceur-ws.org/Vol-2253/paper49.pdf, Proc. CLiC-it 2018
---
## Markdown TEST
Some text

View File

@ -0,0 +1,5 @@
---
title: SexHateLex
description: SexHateLex is a Chinese lexicon of hateful and sexist words.
data-link: https://doi.org/10.5281/zenodo.4773875
reference: http://ceur-ws.org/Vol-2253/paper49.pdf, Journal of OSNEM, Vol.27, 2022, 100182, ISSN 2468-6964.
---

View File

@ -0,0 +1,105 @@
import matter from "gray-matter";
import mdxmermaid from "mdx-mermaid";
import { h } from "hastscript";
import remarkCallouts from "@flowershow/remark-callouts";
import remarkEmbed from "@flowershow/remark-embed";
import remarkGfm from "remark-gfm";
import remarkMath from "remark-math";
import remarkSmartypants from "remark-smartypants";
import remarkToc from "remark-toc";
import remarkWikiLink from "@flowershow/remark-wiki-link";
import rehypeAutolinkHeadings from "rehype-autolink-headings";
import rehypeKatex from "rehype-katex";
import rehypeSlug from "rehype-slug";
import rehypePrismPlus from "rehype-prism-plus";
import { serialize } from "next-mdx-remote/serialize";
/**
 * Parse a markdown or MDX file to an MDX source form + front matter data.
 *
 * The front matter is split off with gray-matter; the remaining content is
 * serialized with next-mdx-remote using the remark/rehype plugins below, and
 * the front matter is exposed to the MDX content as its `scope`.
 *
 * @param {string} source - the contents of a markdown or mdx file
 * @param {string} format - used to indicate to next-mdx-remote which format to
 *   use (md or mdx); also passed as the `path` of the serialized content
 * @returns {Promise<{ mdxSource: object, frontMatter: object, excerpt: string }>}
 *   the serialized MDX source, the parsed front matter, and an excerpt made of
 *   the content that precedes the first blank line
 */
const parse = async function (source, format) {
  const { content, data, excerpt } = matter(source, {
    excerpt: (file) => {
      // Generate an excerpt for the file: everything before the first blank line
      file.excerpt = file.content.split("\n\n")[0];
    },
  });

  const mdxSource = await serialize(
    { value: content, path: format },
    {
      // Optionally pass remark/rehype plugins
      mdxOptions: {
        remarkPlugins: [
          remarkEmbed,
          remarkGfm,
          [remarkSmartypants, { quotes: false, dashes: "oldschool" }],
          remarkMath,
          remarkCallouts,
          remarkWikiLink,
          [
            remarkToc,
            {
              heading: "Table of contents",
              tight: true,
            },
          ],
          [mdxmermaid, {}],
        ],
        rehypePlugins: [
          rehypeSlug,
          [
            rehypeAutolinkHeadings,
            {
              properties: { className: 'heading-link' },
              // Only link h2-h6, skipping the table-of-contents heading and
              // headings marked with the "blockquote-heading" class.
              test(element) {
                const { id, className } = element.properties ?? {};
                // hast normally stores class names as an array of strings, so a
                // plain `!==` against a string never matches; accept both forms.
                const classNames = Array.isArray(className)
                  ? className
                  : [className];
                return (
                  ["h2", "h3", "h4", "h5", "h6"].includes(element.tagName) &&
                  id !== "table-of-contents" &&
                  !classNames.includes("blockquote-heading")
                );
              },
              // Icon (a "#" glyph path) inserted as the content of each heading link
              content() {
                return [
                  h(
                    "svg",
                    {
                      xmlns: "http://www.w3.org/2000/svg",
                      fill: "#ab2b65",
                      viewBox: "0 0 20 20",
                      className: "w-5 h-5",
                    },
                    [
                      h("path", {
                        fillRule: "evenodd",
                        clipRule: "evenodd",
                        d: "M9.493 2.853a.75.75 0 00-1.486-.205L7.545 6H4.198a.75.75 0 000 1.5h3.14l-.69 5H3.302a.75.75 0 000 1.5h3.14l-.435 3.148a.75.75 0 001.486.205L7.955 14h2.986l-.434 3.148a.75.75 0 001.486.205L12.456 14h3.346a.75.75 0 000-1.5h-3.14l.69-5h3.346a.75.75 0 000-1.5h-3.14l.435-3.147a.75.75 0 00-1.486-.205L12.045 6H9.059l.434-3.147zM8.852 7.5l-.69 5h2.986l.69-5H8.852z",
                      }),
                    ]
                  ),
                ];
              },
            },
          ],
          [rehypeKatex, { output: "mathml" }],
          [rehypePrismPlus, { ignoreMissing: true }],
        ],
        format,
      },
      scope: data,
    }
  );

  return {
    mdxSource: mdxSource,
    frontMatter: data,
    excerpt,
  };
};
export default parse;

View File

@ -0,0 +1,5 @@
/// <reference types="next" />
/// <reference types="next/image-types/global" />
// NOTE: This file should not be edited
// see https://nextjs.org/docs/basic-features/typescript for more information.

File diff suppressed because it is too large Load Diff

View File

@ -26,17 +26,42 @@
"feed": "^4.2.2",
"flexsearch": "^0.7.31",
"focus-visible": "^5.2.0",
"next": "13.3.0",
"next-router-mock": "^0.9.3",
"next-superjson-plugin": "^0.5.7",
"postcss-focus-visible": "^6.0.4",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-hook-form": "^7.43.9",
"react-markdown": "^8.0.7",
"remark-gfm": "^3.0.1",
"superjson": "^1.12.3",
"tailwindcss": "^3.3.0"
"tailwindcss": "^3.3.0",
"@flowershow/core": "^0.4.10",
"@flowershow/remark-callouts": "^1.0.0",
"@flowershow/remark-embed": "^1.0.0",
"@flowershow/remark-wiki-link": "^1.1.2",
"@heroicons/react": "^2.0.17",
"@opentelemetry/api": "^1.4.0",
"@tanstack/react-table": "^8.8.5",
"@types/node": "18.16.0",
"@types/react": "18.2.0",
"@types/react-dom": "18.2.0",
"eslint": "8.39.0",
"eslint-config-next": "13.3.1",
"gray-matter": "^4.0.3",
"hastscript": "^7.2.0",
"mdx-mermaid": "2.0.0-rc7",
"next": "13.2.1",
"next-mdx-remote": "^4.4.1",
"papaparse": "^5.4.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-vega": "^7.6.0",
"rehype-autolink-headings": "^6.1.1",
"rehype-katex": "^6.0.3",
"rehype-prism-plus": "^1.5.1",
"rehype-slug": "^5.1.0",
"remark-gfm": "^3.0.1",
"remark-math": "^5.1.1",
"remark-smartypants": "^2.0.0",
"remark-toc": "^8.0.1"
},
"devDependencies": {
"eslint": "8.26.0",

View File

@ -0,0 +1,99 @@
import { Container } from '../components/Container'
import clientPromise from '../lib/mddb'
import fs from 'fs'
import { MDXRemote } from 'next-mdx-remote'
import { serialize } from 'next-mdx-remote/serialize'
import { Card } from '../components/Card'
/**
 * Next.js build-time data loader for a single markdown page.
 *
 * Joins the `slug` route segments into a URL path, looks the file up in the
 * markdown database, reads it from disk and serializes it (front matter
 * included) so the page component can render it with MDXRemote.
 *
 * @param {{ params: { slug?: string[] } }} context
 * @returns {Promise<{ props: { mdxSource: object } }>}
 */
export const getStaticProps = async ({ params }) => {
  const { slug } = params
  // A missing slug maps to the empty URL path
  const urlPath = slug ? slug.join('/') : ''

  const db = await clientPromise
  const { file_path: filePath } = await db.getFileByUrl(urlPath)
  const rawMarkdown = fs.readFileSync(filePath, { encoding: 'utf-8' })

  return {
    props: {
      mdxSource: await serialize(rawMarkdown, { parseFrontmatter: true }),
    },
  }
}
/**
 * Lists every md/mdx document known to the markdown database as a route,
 * using the segments of its `url_path` as the `slug` parameter.
 * `fallback: false` is returned, so only these paths are served.
 */
export async function getStaticPaths() {
  const db = await clientPromise
  const documents = await db.getFiles({ extensions: ['md', 'mdx'] })

  return {
    paths: documents.map(({ url_path }) => ({
      params: { slug: url_path.split('/') },
    })),
    fallback: false,
  }
}
/**
 * Tell whether a metadata value is an absolute http(s) URL that can safely be
 * rendered as a link.
 *
 * Merely constructing a `URL` is not enough: any "word: text" string parses as
 * a URL with a custom scheme, and `javascript:` values would parse too. Only
 * string values using the `http:` or `https:` protocol are accepted.
 *
 * @param {unknown} urlString - the value to check
 * @returns {boolean} true only for string values that parse as http(s) URLs
 */
const isValidUrl = (urlString) => {
  // Non-strings (arrays, numbers, ...) would be coerced by `new URL`; reject them
  if (typeof urlString !== 'string') {
    return false
  }
  try {
    const { protocol } = new URL(urlString)
    return protocol === 'http:' || protocol === 'https:'
  } catch (e) {
    // `new URL` throws on unparseable input, which simply means "not a URL"
    return false
  }
}
const Meta = ({keyValuePairs}) => {
const prettifyMetaValue = (value) => value.replaceAll('-',' ').charAt(0).toUpperCase() + value.replaceAll('-',' ').slice(1);
return (
<>
{keyValuePairs.map((entry) => {
return isValidUrl(entry[1]) ? (
<Card.Description>
<span className="font-semibold">
{prettifyMetaValue(entry[0])}: {' '}
</span>
<a
className="text-ellipsis underline transition hover:text-teal-400 dark:hover:text-teal-900"
href={entry[1]}
>
{entry[1]}
</a>
</Card.Description>
) : (
<Card.Description>
<span className="font-semibold">{prettifyMetaValue(entry[0])}: </span>
{Array.isArray(entry[1]) ? entry[1].join(', ') : entry[1]}
</Card.Description>
)
})}
</>
)
}
export default function DRDPage({ mdxSource }) {
const meta = mdxSource.frontmatter
const keyValuePairs = Object.entries(meta).filter(
(entry) => entry[0] !== 'title'
)
return (
<>
<Container className="mt-16 lg:mt-32">
<article>
<header className="flex flex-col">
<h1 className="mt-6 text-4xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
{meta.title}
</h1>
<Card as="article">
<Meta keyValuePairs={keyValuePairs} />
</Card>
</header>
<div className="prose dark:prose-invert">
<MDXRemote {...mdxSource} />
</div>
</article>
</Container>
</>
)
}

View File

@ -3,19 +3,22 @@ import fs from 'fs'
import { Card } from '../components/Card'
import { Container } from '../components/Container'
import clientPromise from '@/lib/mddb'
import clientPromise from '../lib/mddb'
import ReactMarkdown from 'react-markdown'
import { Index } from 'flexsearch'
import { useForm } from 'react-hook-form'
import Link from 'next/link'
import { serialize } from 'next-mdx-remote/serialize'
import { MDXRemote } from 'next-mdx-remote'
function DatasetCard({ dataset }) {
return (
<Card as="article">
<Card.Title>{dataset.title}</Card.Title>
<Card.Title><Link href={dataset.url}>{dataset.title}</Link></Card.Title>
<Card.Description>
<span className="font-semibold">Link to publication: </span>{' '}
<a
className="underline transition hover:text-teal-400 dark:hover:text-teal-900 text-ellipsis"
className="text-ellipsis underline transition hover:text-teal-400 dark:hover:text-teal-900"
href={dataset['link-to-publication']}
>
{dataset['link-to-publication']}
@ -24,7 +27,7 @@ function DatasetCard({ dataset }) {
<Card.Description>
<span className="font-semibold">Link to data: </span>
<a
className="underline transition hover:text-teal-600 dark:hover:text-teal-900 text-ellipsis"
className="text-ellipsis underline transition hover:text-teal-600 dark:hover:text-teal-900"
href={dataset['link-to-data']}
>
{dataset['link-to-data']}
@ -69,14 +72,61 @@ function DatasetCard({ dataset }) {
</Card>
)
}
export default function Home({ datasets, indexText, availableLanguages, availablePlatforms }) {
function ListOfAbusiveKeywordsCard({ list }) {
return (
<Card as="article">
<Card.Title><Link href={list.url}>{list.title}</Link></Card.Title>
{list.description && (
<Card.Description>
<span className="font-semibold">List Description: </span>{' '}
{list.description}
</Card.Description>
)}
<Card.Description>
<span className="font-semibold">Data Link: </span>
<a
className="text-ellipsis underline transition hover:text-teal-600 dark:hover:text-teal-900"
href={list['data-link']}
>
{list['data-link']}
</a>
</Card.Description>
<Card.Description>
<span className="font-semibold">Reference: </span>
<a
className="text-ellipsis underline transition hover:text-teal-600 dark:hover:text-teal-900"
href={list.reference}
>
{list.reference}
</a>
</Card.Description>
</Card>
)
}
export default function Home({
datasets,
indexText,
listsOfKeywords,
contributingText,
availableLanguages,
availablePlatforms,
}) {
const index = new Index()
datasets.forEach((dataset) => index.add(dataset.id, `${dataset.title} ${dataset['task-description']} ${dataset['details-of-task']} ${dataset['reference']}`))
const { register, watch } = useForm({ defaultValues: {
searchTerm: '',
lang: '',
platform: ''
}})
datasets.forEach((dataset) =>
index.add(
dataset.id,
`${dataset.title} ${dataset['task-description']} ${dataset['details-of-task']} ${dataset['reference']}`
)
)
const { register, watch, handleSubmit, reset } = useForm({
defaultValues: {
searchTerm: '',
lang: '',
platform: '',
},
})
return (
<>
<Head>
@ -89,49 +139,68 @@ export default function Home({ datasets, indexText, availableLanguages, availabl
<Container className="mt-9">
<div className="max-w-2xl">
<h1 className="text-4xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
Hate Speech Dataset Catalogue
{indexText.frontmatter.title}
</h1>
<article className="mt-6 flex flex-col gap-y-2 text-base text-zinc-600 dark:text-zinc-400">
<ReactMarkdown>{indexText}</ReactMarkdown>
<MDXRemote {...indexText} />
</article>
</div>
</Container>
<Container className="mt-24 md:mt-28">
<div className="mx-auto grid max-w-xl grid-cols-1 gap-y-8 lg:max-w-none">
<form className="rounded-2xl border border-zinc-100 px-4 py-6 sm:p-6 dark:border-zinc-700/40">
<div className="mx-auto grid max-w-7xl grid-cols-1 gap-y-8 lg:max-w-none">
<h2 className="text-xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
Datasets
</h2>
<form onSubmit={handleSubmit(() => reset())} className="rounded-2xl border border-zinc-100 px-4 py-6 dark:border-zinc-700/40 sm:p-6">
<p className="mt-2 text-lg font-semibold text-zinc-600 dark:text-zinc-100">
Search for datasets
</p>
<div className="mt-6 flex flex-col sm:flex-row gap-3">
<div className="mt-6 flex flex-col gap-3 sm:flex-row">
<input
placeholder="Search here"
aria-label="Hate speech on Twitter"
required
{...register('searchTerm')}
className="min-w-0 flex-auto appearance-none rounded-md border border-zinc-900/10 bg-white px-3 py-[calc(theme(spacing.2)-1px)] shadow-md shadow-zinc-800/5 placeholder:text-zinc-600 focus:border-teal-500 focus:outline-none focus:ring-4 focus:ring-teal-500/10 dark:border-zinc-700 dark:bg-zinc-700/[0.15] dark:text-zinc-200 dark:placeholder:text-zinc-200 dark:focus:border-teal-400 dark:focus:ring-teal-400/10 sm:text-sm"
/>
<select
placeholder="Language"
defaultValue=""
className="min-w-0 flex-auto text-zinc-600 appearance-none rounded-md border border-zinc-900/10 bg-white px-3 py-[calc(theme(spacing.2)-1px)] shadow-md shadow-zinc-800/5 placeholder:text-zinc-400 focus:border-teal-500 focus:outline-none focus:ring-4 focus:ring-teal-500/10 dark:border-zinc-700 dark:bg-zinc-700/[0.15] dark:text-zinc-200 dark:placeholder:text-zinc-500 dark:focus:border-teal-400 dark:focus:ring-teal-400/10 sm:text-sm"
className="min-w-0 flex-auto appearance-none rounded-md border border-zinc-900/10 bg-white px-3 py-[calc(theme(spacing.2)-1px)] text-zinc-600 shadow-md shadow-zinc-800/5 placeholder:text-zinc-400 focus:border-teal-500 focus:outline-none focus:ring-4 focus:ring-teal-500/10 dark:border-zinc-700 dark:bg-zinc-700/[0.15] dark:text-zinc-200 dark:placeholder:text-zinc-500 dark:focus:border-teal-400 dark:focus:ring-teal-400/10 sm:text-sm"
{...register('lang')}
>
<option value="" disabled hidden>Filter by language</option>
<option value="" disabled hidden>
Filter by language
</option>
{availableLanguages.map((lang) => (
<option key={lang} className='dark:bg-white dark:text-black' value={lang}>{lang}</option>
<option
key={lang}
className="dark:bg-white dark:text-black"
value={lang}
>
{lang}
</option>
))}
</select>
<select
placeholder="Platform"
defaultValue=""
className="min-w-0 flex-auto text-zinc-600 appearance-none rounded-md border border-zinc-900/10 bg-white px-3 py-[calc(theme(spacing.2)-1px)] shadow-md shadow-zinc-800/5 placeholder:text-zinc-400 focus:border-teal-500 focus:outline-none focus:ring-4 focus:ring-teal-500/10 dark:border-zinc-700 dark:bg-zinc-700/[0.15] dark:text-zinc-200 dark:placeholder:text-zinc-500 dark:focus:border-teal-400 dark:focus:ring-teal-400/10 sm:text-sm"
className="min-w-0 flex-auto appearance-none rounded-md border border-zinc-900/10 bg-white px-3 py-[calc(theme(spacing.2)-1px)] text-zinc-600 shadow-md shadow-zinc-800/5 placeholder:text-zinc-400 focus:border-teal-500 focus:outline-none focus:ring-4 focus:ring-teal-500/10 dark:border-zinc-700 dark:bg-zinc-700/[0.15] dark:text-zinc-200 dark:placeholder:text-zinc-500 dark:focus:border-teal-400 dark:focus:ring-teal-400/10 sm:text-sm"
{...register('platform')}
>
<option value="" disabled hidden>Filter by platform</option>
<option value="" disabled hidden>
Filter by platform
</option>
{availablePlatforms.map((platform) => (
<option key={platform} className='dark:bg-white dark:text-black' value={platform}>{platform}</option>
<option
key={platform}
className="dark:bg-white dark:text-black"
value={platform}
>
{platform}
</option>
))}
</select>
<button type='submit' className='inline-flex items-center gap-2 justify-center rounded-md py-2 px-3 text-sm outline-offset-2 transition active:transition-none bg-zinc-800 font-semibold text-zinc-100 hover:bg-zinc-700 active:bg-zinc-800 active:text-zinc-100/70 dark:bg-zinc-700 dark:hover:bg-zinc-600 dark:active:bg-zinc-700 dark:active:text-zinc-100/70 flex-none'>Clear filters</button>
</div>
</form>
<div className="flex flex-col gap-16">
@ -157,24 +226,70 @@ export default function Home({ datasets, indexText, availableLanguages, availabl
</div>
</div>
</Container>
<Container className="mt-16">
<h2 className="text-xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
Lists of Abusive Keywords
</h2>
<div className="mt-3 flex flex-col gap-16">
{listsOfKeywords.map((list) => (
<ListOfAbusiveKeywordsCard key={list.title} list={list} />
))}
</div>
</Container>
<Container className="mt-16">
<h2 className="text-xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
How to contribute
</h2>
<article className="mt-6 flex flex-col gap-y-8 text-base text-zinc-600 dark:text-zinc-400 contributing">
<MDXRemote {...contributingText} />
</article>
</Container>
</>
)
}
export async function getStaticProps() {
const mddb = await clientPromise
const allPages = await mddb.getFiles({ extensions: ['md', 'mdx'] })
const datasets = allPages
.filter((page) => page.url_path !== '/')
.map((page) => ({ ...page.metadata, id: page._id }))
const index = allPages.filter((page) => page.url_path === '/')[0]
const source = fs.readFileSync(index.file_path, { encoding: 'utf-8' })
const availableLanguages = [... new Set(datasets.map((dataset) => dataset.language))]
const availablePlatforms = [... new Set(datasets.map((dataset) => dataset.platform).flat())]
const datasetPages = await mddb.getFiles({
folder: 'datasets',
extensions: ['md', 'mdx'],
})
const datasets = datasetPages.map((page) => ({
...page.metadata,
id: page._id,
url: page.url_path,
}))
const listsOfKeywordsPages = await mddb.getFiles({
folder: 'keywords',
extensions: ['md', 'mdx'],
})
const listsOfKeywords = listsOfKeywordsPages.map((page) => ({
...page.metadata,
id: page._id,
url: page.url_path,
}))
const index = await mddb.getFileByUrl('/')
const contributing = await mddb.getFileByUrl('contributing')
let indexSource = fs.readFileSync(index.file_path, { encoding: 'utf-8' })
let contributingSource = fs.readFileSync(contributing.file_path, {
encoding: 'utf-8',
})
contributingSource = await serialize(contributingSource, { parseFrontmatter: true })
indexSource = await serialize(indexSource, { parseFrontmatter: true })
const availableLanguages = [
...new Set(datasets.map((dataset) => dataset.language)),
]
const availablePlatforms = [
...new Set(datasets.map((dataset) => dataset.platform).flat()),
]
return {
props: {
indexText: source,
datasets,
listsOfKeywords,
indexText: indexSource,
contributingText: contributingSource,
availableLanguages,
availablePlatforms,
},

View File

@ -2,3 +2,7 @@
@import 'tailwindcss/components';
@import './prism.css';
@import 'tailwindcss/utilities';
/* Vertical spacing after each list item inside elements with the `contributing` class. */
.contributing li {
margin-bottom: 1.75rem;
}

View File

@ -0,0 +1,14 @@
---
title: string
link-to-publication: url
link-to-data: url
task-description: string
details-of-task: string
size-of-dataset: number
percentage-abusive: number
language: string
level-of-annotation: list eg: ["Posts", "Comments", ...]
platform: list eg: ["Youtube", "Facebook", ...]
medium: list eg: ["Text", "Emojis", "Images", ...]
reference: string
---

View File

@ -0,0 +1,5 @@
---
title: string
data-link: url
reference: string
---

View File

@ -0,0 +1,28 @@
{
"compilerOptions": {
"lib": [
"dom",
"dom.iterable",
"esnext"
],
"allowJs": true,
"skipLibCheck": true,
"strict": false,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"incremental": true,
"esModuleInterop": true,
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve"
},
"include": [
"next-env.d.ts",
"**/*.ts",
"**/*.tsx"
],
"exclude": [
"node_modules"
]
}