import { CommentRegexes, parseComments } from './parsing.js';

/* Format a post (and optionally the start of a comment) in the format of LessWrong
   posts in the Common Crawl dataset. Below is an example prompt showing a short post
   and the start of a comment reply. Note that this format sometimes omits whitespace
   and repeats certain pieces of info, such as the title, tags and karma. It also uses
   an uncommon Unicode character (U+F141), which is rendered as 3 dots similar to an
   ellipsis in the browser but may not display correctly in text editors.

Example Post Title - LessWrong 2.0 viewer
ArchiveSequencesAbout
Search
Log In
Log InQuestionsEventsShortformAlignment ForumAF
CommentsHomeFeaturedAllTagsRecent Comments
Example Post Title
Example Author31 Jan 2020 12:34 UTC
24 points
15 commentsLW link
First tagSecond tag

Post permalinkLink without commentsLink without top nav barsLink without comments or top nav bars
Here's my post. What do you think?
Example Author31 Jan 2020 12:34 UTC
24 points
15 commentsLW link
First tagSecond tag

Post permalinkLink without commentsLink without top nav barsLink without comments or top nav bars
Comments sorted by top scores.
GW comment by Example Commenter 2020-01-31T12:35:56.789Z · score: 14 (11 votes) · LW Have you considered
*/
// Arguments:
//   post    - object with optional `title`, `author`, `karma`, `tags` (array of strings)
//             and `body` properties. Missing values fall back to 'Untitled', 'anonymous',
//             0, no tags and an empty body respectively.
//   comment - optional object with `author`, `karma` and `body` properties plus an
//             `empty()` method. If it is null/undefined, or `empty()` returns true, the
//             prompt ends right after the post so the model writes the whole comment.
// Returns the prompt as a string.
function formatPromptLWCommonCrawl(post, comment) {
  const title = post.title || 'Untitled';
  const postAuthor = post.author || 'anonymous';
  const postKarma = post.karma || 0;
  const tags = (post.tags ?? []).join('');

  // The post metadata appears twice in this format: once above and once below the post
  // body. The line between the tags and the permalink row holds the private-use glyph
  // U+F141 (shown as three dots in the browser). It is written as an escape sequence so
  // that it stays visible in editors and cannot be stripped by accident.
  // NOTE(review): placement of the glyph on its own line is assumed from the layout of
  // the format; confirm against a real Common Crawl sample.
  const postMeta = `\
${postAuthor}31 Jan 2020 12:18 UTC
${postKarma} points
25 commentsLW link
${tags}
\uF141
Post permalinkLink without commentsLink without top nav barsLink without comments or top nav bars
`;

  // The syntax mimics the way LessWrong posts appear in the Common Crawl dataset,
  // including the missing whitespace and the repeated title.
  const formattedPost = `\
${title} - LessWrong 2.0 viewer
ArchiveSequencesAbout
Search
Log In
Log InQuestionsEventsShortformAlignment ForumAF
CommentsHomeFeaturedAllTagsRecent Comments
${title}
${postMeta}${post.body ?? ''}
${postMeta}Comments sorted by top scores.
`;

  // Include the start of the comment only if the user specified parts of it. Otherwise,
  // leave it out and let the model generate the comment from scratch.
  if (comment == null || comment.empty()) {
    return formattedPost;
  }

  const commentAuthor = comment.author || 'anonymous';
  const commentKarma = comment.karma || 0;
  // The vote count is a rough guess (half the score). Taking the absolute value keeps it
  // from going negative for downvoted comments.
  const voteCount = Math.floor(Math.abs(commentKarma) / 2);

  // No trailing newline: the model continues the comment body where the user left off.
  const formattedComment = `\
GW comment by ${commentAuthor} 2020-02-01T18:03:27.512Z \
· score: ${commentKarma} (${voteCount} votes) · LW \
${comment.body ?? ''}\
`;

  return formattedPost + formattedComment;
}

// Parse the completions from the API as forum comments. This aims to handle multiple
// variations in the format of the completions.

// If given, origComment may be a ForumComment object to serve as a starting point for
// the first comment of each completion. Any properties that evaluate to true will be
// preserved. Those that evaluate to false will be overwritten with the relevant
// information from the completion.

// This returns an array of ForumComment objects.
function parseCommentsLWCommonCrawl(reply, origComment) {
  // An author name: one to three words made of letters, dots and hyphens, each word at
  // most 25 characters long.
  const name = String.raw`[\p{Letter}.-]{1,25}(?: [\p{Letter}.-]{1,25}){0,2}`;

  // Three alternative layouts for the author line. Each one captures the author name in
  // its own group (groups 1-3), so exactly one of them is set on a match.
  const authorRegex = new RegExp(
    [
      // Name followed (after at most 50 other characters) by a "31 Jan 2020 16:21 UTC"
      // style timestamp. Examples:
      //    Darius31 Jan 2020 16:21 UTC
      //    leuzal[AF (wrong about AI) · GW(p)]31 Jan 2020 19:29 UTC
      String.raw`(${name}).{0,50}?\d{1,2} \p{Letter}{3,8} \d{4} \d{1,2}:\d{2} UTC`,
      // "comment by <name>" prefix. Example:
      //    commcomment by Vanessa Kosoy (vanessa-kosoy) 2020-01-31T15:04:49.440Z
      String.raw`(?:comm)?[cC]omment by (${name})`,
      // Name, an optional bracketed score, then a relative age. Examples:
      //    kwadronaut [-4] a month ago
      //    timstaudt12 hours ago [–]
      // The age count is a non-capturing group so that it does not add a fourth capture
      // group next to the three author groups.
      String.raw`(${name})(?: ?\[[\d–-]{1,5}\] ?)?(?:\d+|a) [a-z]{1,10} ago`,
    ].join('|'),
    'du',
  );

  // The score is captured in group 1 ("N points") or group 2 ("score: N"); a trailing
  // "(N votes)" is consumed but not captured. Examples:
  //    1 point
  //    52 points
  //    score: 3 (3 votes)
  const scoreRegex = /(?:(\d+) points?|score:? (\d+))(?: \(\d+ votes\))?/d;

  // Everything after "LW link" (and one optional whitespace character) is captured as
  // the body; the `s` flag lets it span multiple lines. Examples:
  //    6 repliesLW link\nIs GPT-3 being trained on one of those pandemics?
  //    1 pointLW link\nNot deep mind?
  const bodyRegex = /(?:\d+ replies)?LW link\s?(.*)/ds;

  const regexes = new CommentRegexes(authorRegex, scoreRegex, bodyRegex);
  return parseComments(reply, origComment, regexes);
}

// Public API of this module: the prompt formatter and the matching completion parser
// for the LessWrong Common Crawl format.
export {
  formatPromptLWCommonCrawl,
  parseCommentsLWCommonCrawl,
};
