[alan-turing][m] - small tweaks (#830)

This commit is contained in:
Luccas Mateus 2023-05-02 12:53:10 -03:00 committed by GitHub
parent ed3a26cd6d
commit 014c4c043d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 122 additions and 47 deletions

View File

@ -1,6 +1,18 @@
## Intro
This page catalogues datasets annotated for hate speech, online abuse, and offensive language. They may be useful for e.g. training a natural language processing system to detect this language.
It's built on top of [PortalJS](https://portaljs.org/), which lets you publish datasets, lists of offensive keywords and static pages. All of these are stored as markdown files inside the `content` folder.
- .md files inside `content/datasets/` will appear in the dataset list section of the homepage and be searchable, and each will have an individual page at `datasets/<file name>`
- .md files inside `content/keywords/` will appear in the list of offensive keywords section of the homepage, and each will have an individual page at `keywords/<file name>`
- .md files inside `content/` will be converted to static pages at the URL `/<file name>`, e.g. `content/about.md` becomes `/about`
This is also a Next.js project, so you can use the following steps to run the website locally.
## Getting started
To get started with this template, first install the npm dependencies:
To get started first install the npm dependencies:
```bash
npm install
@ -13,7 +25,3 @@ npm run dev
```
Finally, open [http://localhost:3000](http://localhost:3000) in your browser to view the website.
## License
This site template is a commercial product and is licensed under the [Tailwind UI license](https://tailwindui.com/license).

View File

@ -21,7 +21,7 @@ export function Footer() {
<Container.Inner>
<div className="flex flex-col items-center justify-between gap-6 sm:flex-row">
<p className="text-sm font-medium text-zinc-800 dark:text-zinc-200">
hatespeechdata maintained by <a href='https://github.com/leondz'>leondz</a>
Built with <a href='https://portaljs.org'>PortalJS 🌀</a>
</p>
<p className="text-sm text-zinc-400 dark:text-zinc-500">
&copy; {new Date().getFullYear()} Leon Derczynski. All rights

View File

@ -0,0 +1,5 @@
---
title: About
---
This is an about page, left here as an example

View File

@ -1,22 +0,0 @@
---
title: Contributing
---
We accept entries to our catalogue based on pull requests to the content folder. The dataset must be available for download to be included in the list. If you want to add an entry, follow these steps!
Please send just one dataset addition/edit at a time - edit it in, then save. This will make everyone's life easier (including yours!)
- Go to the repo url file and click the "Add file" dropdown and then click on "Create new file".
![](https://i.imgur.com/2PR0ZgL.png)
- On the following page, type `content/datasets/<name-of-the-file>.md` if you want to add an entry to the datasets catalog, or `content/keywords/<name-of-the-file>.md` if you want to add an entry to the lists of abusive keywords.
![](https://i.imgur.com/rr3uSYu.png)
- Copy the contents of `templates/dataset.md` or `templates/keywords.md` respectively into the field below, filling out the fields with the correct data format
![](https://i.imgur.com/x6JIjhz.png)
- Click on "Commit changes". In the popup, make sure you give some brief detail on the proposed change, and then click on "Propose changes".
![](https://i.imgur.com/BxuxKEJ.png)
- Submit the pull request on the next page when prompted.

View File

@ -0,0 +1,14 @@
---
title: AbuseEval v1.0
link-to-publication: http://www.lrec-conf.org/proceedings/lrec2020/pdf/2020.lrec-1.760.pdf
link-to-data: https://github.com/tommasoc80/AbuseEval
task-description: Explicitness annotation of offensive and abusive content
details-of-task: "Enriched versions of the OffensEval/OLID dataset with the distinction of explicit/implicit offensive messages and the new dimension for abusive messages. Labels for offensive language: EXPLICIT, IMPLICT, NOT; Labels for abusive language: EXPLICIT, IMPLICT, NOTABU"
size-of-dataset: 14100
percentage-abusive: 20.75
language: English
level-of-annotation: ["Tweets"]
platform: ["Twitter"]
medium: ["Text"]
reference: "Caselli, T., Basile, V., Jelena, M., Inga, K., and Michael, G. 2020. \"I feel offended, don't be abusive! implicit/explicit messages in offensive and abusive language\". The 12th Language Resources and Evaluation Conference (pp. 6193-6202). European Language Resources Association."
---

View File

@ -0,0 +1,14 @@
---
title: "CoRAL: a Context-aware Croatian Abusive Language Dataset"
link-to-publication: https://aclanthology.org/2022.findings-aacl.21/
link-to-data: https://github.com/shekharRavi/CoRAL-dataset-Findings-of-the-ACL-AACL-IJCNLP-2022
task-description: Multi-class based on context dependency categories (CDC)
details-of-task: Detecting CDC from abusive comments
size-of-dataset: 2240
percentage-abusive: 100
language: "Croatian"
level-of-annotation: ["Posts"]
platform: ["Posts"]
medium: ["Newspaper Comments"]
reference: "Ravi Shekhar, Mladen Karan and Matthew Purver (2022). CoRAL: a Context-aware Croatian Abusive Language Dataset. Findings of the ACL: AACL-IJCNLP."
---

View File

@ -0,0 +1,14 @@
---
title: Large-Scale Hate Speech Detection with Cross-Domain Transfer
link-to-publication: https://aclanthology.org/2022.lrec-1.238/
link-to-data: https://github.com/avaapm/hatespeech
task-description: Three-class (Hate speech, Offensive language, None)
details-of-task: Hate speech detection on social media (Twitter) including 5 target groups (gender, race, religion, politics, sports)
size-of-dataset: "100k English (27593 hate, 30747 offensive, 41660 none)"
percentage-abusive: 58.3
language: English
level-of-annotation: ["Posts"]
platform: ["Twitter"]
medium: ["Text", "Image"]
reference: "Cagri Toraman, Furkan Şahinuç, Eyup Yilmaz. 2022. Large-Scale Hate Speech Detection with Cross-Domain Transfer. In Proceedings of the Thirteenth Language Resources and Evaluation Conference, pages 2215–2225, Marseille, France. European Language Resources Association."
---

View File

@ -0,0 +1,14 @@
---
title: Measuring Hate Speech
link-to-publication: https://arxiv.org/abs/2009.10277
link-to-data: https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech
task-description: 10 ordinal labels (sentiment, (dis)respect, insult, humiliation, inferior status, violence, dehumanization, genocide, attack/defense, hate speech), which are debiased and aggregated into a continuous hate speech severity score (hate_speech_score) that includes a region for counterspeech & supportive speech. Includes 8 target identity groups (race/ethnicity, religion, national origin/citizenship, gender, sexual orientation, age, disability, political ideology) and 42 identity subgroups.
details-of-task: Hate speech measurement on social media in English
size-of-dataset: "39,565 comments annotated by 7,912 annotators on 10 ordinal labels, for 1,355,560 total labels."
percentage-abusive: 25
language: English
level-of-annotation: ["Social media comment"]
platform: ["Twitter", "Reddit", "Youtube"]
medium: ["Text"]
reference: "Kennedy, C. J., Bacon, G., Sahn, A., & von Vacano, C. (2020). Constructing interval variables via faceted Rasch measurement and multitask deep learning: a hate speech application. arXiv preprint arXiv:2009.10277."
---

View File

@ -0,0 +1,14 @@
---
title: Offensive Language and Hate Speech Detection for Danish
link-to-publication: http://www.derczynski.com/papers/danish_hsd.pdf
link-to-data: https://figshare.com/articles/Danish_Hate_Speech_Abusive_Language_data/12220805
task-description: "Branching structure of tasks: Binary (Offensive, Not), Within Offensive (Target, Not), Within Target (Individual, Group, Other)"
details-of-task: Group-directed + Person-directed
size-of-dataset: 3600
percentage-abusive: 0.12
language: Danish
level-of-annotation: ["Posts"]
platform: ["Twitter", "Reddit", "Newspaper comments"]
medium: ["Text"]
reference: "Sigurbergsson, G. and Derczynski, L., 2019. Offensive Language and Hate Speech Detection for Danish. ArXiv."
---

View File

@ -11,3 +11,24 @@ We provide a list of datasets and keywords. If you would like to contribute to o
If you use these resources, please cite (and read!) our paper: Directions in Abusive Language Training Data: Garbage In, Garbage Out. And if you would like to find other resources for researching online hate, visit The Alan Turing Institute's Online Hate Research Hub or read The Alan Turing Institute's Reading List on Online Hate and Abuse Research.
If you're looking for a good paper on online hate training datasets (beyond our paper, of course!) then have a look at Resources and benchmark corpora for hate speech detection: a systematic review by Poletto et al. in Language Resources and Evaluation.
## How to contribute
We accept entries to our catalogue based on pull requests to the content folder. The dataset must be available for download to be included in the list. If you want to add an entry, follow these steps!
Please send just one dataset addition/edit at a time - edit it in, then save. This will make everyone's life easier (including yours!)
- Go to the repo url file and click the "Add file" dropdown and then click on "Create new file".
![](https://i.imgur.com/2PR0ZgL.png)
- On the following page, type `content/datasets/<name-of-the-file>.md` if you want to add an entry to the datasets catalog, or `content/keywords/<name-of-the-file>.md` if you want to add an entry to the lists of abusive keywords. If you just want to add a static page, you can place it in the root of `content` and it will automatically be assigned a URL, e.g. `/content/about.md` becomes the `/about` page.
![](https://i.imgur.com/rr3uSYu.png)
- Copy the contents of `templates/dataset.md` or `templates/keywords.md` respectively into the field below, filling out the fields with the correct data format
![](https://i.imgur.com/x6JIjhz.png)
- Click on "Commit changes". In the popup, make sure you give some brief detail on the proposed change, and then click on "Propose changes".
<img src='https://i.imgur.com/BxuxKEJ.png' style={{ maxWidth: '50%', margin: '0 auto' }}/>
- Submit the pull request on the next page when prompted.

View File

@ -4,6 +4,7 @@ import fs from 'fs'
import { MDXRemote } from 'next-mdx-remote'
import { serialize } from 'next-mdx-remote/serialize'
import { Card } from '../components/Card'
import Head from 'next/head'
export const getStaticProps = async ({ params }) => {
const urlPath = params.slug ? params.slug.join('/') : ''
@ -79,6 +80,9 @@ export default function DRDPage({ mdxSource }) {
)
return (
<>
<Head>
<title>{meta.title}</title>
</Head>
<Container className="mt-16 lg:mt-32">
<article>
<header className="flex flex-col">

View File

@ -4,7 +4,6 @@ import fs from 'fs'
import { Card } from '../components/Card'
import { Container } from '../components/Container'
import clientPromise from '../lib/mddb'
import ReactMarkdown from 'react-markdown'
import { Index } from 'flexsearch'
import { useForm } from 'react-hook-form'
import Link from 'next/link'
@ -109,7 +108,6 @@ export default function Home({
datasets,
indexText,
listsOfKeywords,
contributingText,
availableLanguages,
availablePlatforms,
}) {
@ -141,7 +139,7 @@ export default function Home({
<h1 className="text-4xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
{indexText.frontmatter.title}
</h1>
<article className="mt-6 flex flex-col gap-y-2 text-base text-zinc-600 dark:text-zinc-400">
<article className="mt-6 index-text flex flex-col gap-y-2 text-base text-zinc-600 dark:text-zinc-400 prose dark:prose-invert">
<MDXRemote {...indexText} />
</article>
</div>
@ -236,14 +234,6 @@ export default function Home({
))}
</div>
</Container>
<Container className="mt-16">
<h2 className="text-xl font-bold tracking-tight text-zinc-800 dark:text-zinc-100 sm:text-5xl">
How to contribute
</h2>
<article className="mt-6 flex flex-col gap-y-8 text-base text-zinc-600 dark:text-zinc-400 contributing">
<MDXRemote {...contributingText} />
</article>
</Container>
</>
)
}
@ -270,12 +260,7 @@ export async function getStaticProps() {
}))
const index = await mddb.getFileByUrl('/')
const contributing = await mddb.getFileByUrl('contributing')
let indexSource = fs.readFileSync(index.file_path, { encoding: 'utf-8' })
let contributingSource = fs.readFileSync(contributing.file_path, {
encoding: 'utf-8',
})
contributingSource = await serialize(contributingSource, { parseFrontmatter: true })
indexSource = await serialize(indexSource, { parseFrontmatter: true })
const availableLanguages = [
@ -289,7 +274,6 @@ export async function getStaticProps() {
datasets,
listsOfKeywords,
indexText: indexSource,
contributingText: contributingSource,
availableLanguages,
availablePlatforms,
},

View File

@ -3,6 +3,11 @@
@import './prism.css';
@import 'tailwindcss/utilities';
.contributing li {
margin-bottom: 1.75rem;
.index-text ul,
.index-text p {
margin: 0;
}
.index-text h2 {
margin-top: 1rem;
}